sys/amd64/vmm/vmm.c

   1 /*-
   2  * Copyright (c) 2011 NetApp, Inc.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD$
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 #include <sys/param.h>
  33 #include <sys/systm.h>
  34 #include <sys/kernel.h>
  35 #include <sys/module.h>
  36 #include <sys/sysctl.h>
  37 #include <sys/malloc.h>
  38 #include <sys/pcpu.h>
  39 #include <sys/lock.h>
  40 #include <sys/mutex.h>
  41 #include <sys/proc.h>
  42 #include <sys/sched.h>
  43 #include <sys/smp.h>
  44 #include <sys/systm.h>
  45
  46 #include <vm/vm.h>
  47
  48 #include <machine/vm.h>
  49 #include <machine/pcb.h>
  50 #include <machine/smp.h>
  51 #include <x86/apicreg.h>
  52
  53 #include <machine/vmm.h>
  54 #include "vmm_host.h"
  55 #include "vmm_mem.h"
  56 #include "vmm_util.h"
  57 #include <machine/vmm_dev.h>
  58 #include "vlapic.h"
  59 #include "vmm_msr.h"
  60 #include "vmm_ipi.h"
  61 #include "vmm_stat.h"
  62 #include "vmm_lapic.h"
  63
  64 #include "io/ppt.h"
  65 #include "io/iommu.h"
  66
  67 struct vlapic;
  68
  69 struct vcpu {
  70         int             flags;
  71         enum vcpu_state state;
  72         struct mtx      mtx;
  73         int             pincpu;         /* host cpuid this vcpu is bound to */
  74         int             hostcpu;        /* host cpuid this vcpu last ran on */
  75         uint64_t        guest_msrs[VMM_MSR_NUM];
  76         struct vlapic   *vlapic;
  77         int              vcpuid;
  78         struct savefpu  *guestfpu;      /* guest fpu state */
  79         void            *stats;
  80         struct vm_exit  exitinfo;
  81         enum x2apic_state x2apic_state;
  82         int             nmi_pending;
  83 };
  84 #define VCPU_F_PINNED   0x0001
  85
  86 #define VCPU_PINCPU(vm, vcpuid) \
  87     ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)
  88
  89 #define VCPU_UNPIN(vm, vcpuid)  (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)
  90
  91 #define VCPU_PIN(vm, vcpuid, host_cpuid)                                \
  92 do {                                                                    \
  93         vm->vcpu[vcpuid].flags |= VCPU_F_PINNED;                        \
  94         vm->vcpu[vcpuid].pincpu = host_cpuid;                           \
  95 } while(0)
  96
  97 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
  98 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
  99 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
 100
 101 #define VM_MAX_MEMORY_SEGMENTS  2
 102
 103 struct vm {
 104         void            *cookie;        /* processor-specific data */
 105         void            *iommu;         /* iommu-specific data */
 106         struct vcpu     vcpu[VM_MAXCPU];
 107         int             num_mem_segs;
 108         struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
 109         char            name[VM_MAX_NAMELEN];
 110
 111         /*
 112          * Set of active vcpus.
 113          * An active vcpu is one that has been started implicitly (BSP) or
 114          * explicitly (AP) by sending it a startup ipi.
 115          */
 116         cpuset_t        active_cpus;
 117 };
 118
 119 static struct vmm_ops *ops;
 120 #define VMM_INIT()      (ops != NULL ? (*ops->init)() : 0)
 121 #define VMM_CLEANUP()   (ops != NULL ? (*ops->cleanup)() : 0)
 122
 123 #define VMINIT(vm)      (ops != NULL ? (*ops->vminit)(vm): NULL)
 124 #define VMRUN(vmi, vcpu, rip) \
 125         (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
 126 #define VMCLEANUP(vmi)  (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 127 #define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm)                 \
 128         (ops != NULL ?                                                  \
 129         (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) :       \
 130         ENXIO)
 131 #define VMMMAP_GET(vmi, gpa) \
 132         (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
 133 #define VMGETREG(vmi, vcpu, num, retval)                \
 134         (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
 135 #define VMSETREG(vmi, vcpu, num, val)           \
 136         (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
 137 #define VMGETDESC(vmi, vcpu, num, desc)         \
 138         (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
 139 #define VMSETDESC(vmi, vcpu, num, desc)         \
 140         (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
 141 #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
 142         (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
 143 #define VMGETCAP(vmi, vcpu, num, retval)        \
 144         (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
 145 #define VMSETCAP(vmi, vcpu, num, val)           \
 146         (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
 147
 148 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
 149 #define fpu_stop_emulating()    clts()
 150
 151 static MALLOC_DEFINE(M_VM, "vm", "vm");
 152 CTASSERT(VMM_MSR_NUM <= 64);    /* msr_mask can keep track of up to 64 msrs */
 153
 154 /* statistics */
 155 static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 156
 157 static void
 158 vcpu_cleanup(struct vcpu *vcpu)
 159 {
 160         vlapic_cleanup(vcpu->vlapic);
 161         vmm_stat_free(vcpu->stats);
 162         fpu_save_area_free(vcpu->guestfpu);
 163 }
 164
 165 static void
 166 vcpu_init(struct vm *vm, uint32_t vcpu_id)
 167 {
 168         struct vcpu *vcpu;
 169
 170         vcpu = &vm->vcpu[vcpu_id];
 171
 172         vcpu_lock_init(vcpu);
 173         vcpu->hostcpu = NOCPU;
 174         vcpu->vcpuid = vcpu_id;
 175         vcpu->vlapic = vlapic_init(vm, vcpu_id);
 176         vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
 177         vcpu->guestfpu = fpu_save_area_alloc();
 178         fpu_save_area_reset(vcpu->guestfpu);
 179         vcpu->stats = vmm_stat_alloc();
 180 }
 181
 182 struct vm_exit *
 183 vm_exitinfo(struct vm *vm, int cpuid)
 184 {
 185         struct vcpu *vcpu;
 186
 187         if (cpuid < 0 || cpuid >= VM_MAXCPU)
 188                 panic("vm_exitinfo: invalid cpuid %d", cpuid);
 189
 190         vcpu = &vm->vcpu[cpuid];
 191
 192         return (&vcpu->exitinfo);
 193 }
 194
 195 static int
 196 vmm_init(void)
 197 {
 198         int error;
 199
 200         vmm_host_state_init();
 201         vmm_ipi_init();
 202
 203         error = vmm_mem_init();
 204         if (error)
 205                 return (error);
 206
 207         if (vmm_is_intel())
 208                 ops = &vmm_ops_intel;
 209         else if (vmm_is_amd())
 210                 ops = &vmm_ops_amd;
 211         else
 212                 return (ENXIO);
 213
 214         vmm_msr_init();
 215
 216         return (VMM_INIT());
 217 }
 218
 219 static int
 220 vmm_handler(module_t mod, int what, void *arg)
 221 {
 222         int error;
 223
 224         switch (what) {
 225         case MOD_LOAD:
 226                 vmmdev_init();
 227                 iommu_init();
 228                 error = vmm_init();
 229                 break;
 230         case MOD_UNLOAD:
 231                 error = vmmdev_cleanup();
 232                 if (error == 0) {
 233                         iommu_cleanup();
 234                         vmm_ipi_cleanup();
 235                         error = VMM_CLEANUP();
 236                 }
 237                 break;
 238         default:
 239                 error = 0;
 240                 break;
 241         }
 242         return (error);
 243 }
 244
 245 static moduledata_t vmm_kmod = {
 246         "vmm",
 247         vmm_handler,
 248         NULL
 249 };
 250
 251 /*
 252  * vmm initialization has the following dependencies:
 253  *
 254  * - iommu initialization must happen after the pci passthru driver has had
 255  *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 256  *
 257  * - VT-x initialization requires smp_rendezvous() and therefore must happen
 258  *   after SMP is fully functional (after SI_SUB_SMP).
 259  */
 260 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 261 MODULE_VERSION(vmm, 1);
 262
 263 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
 264
 265 struct vm *
 266 vm_create(const char *name)
 267 {
 268         int i;
 269         struct vm *vm;
 270         vm_paddr_t maxaddr;
 271
 272         const int BSP = 0;
 273
 274         if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 275                 return (NULL);
 276
 277         vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 278         strcpy(vm->name, name);
 279         vm->cookie = VMINIT(vm);
 280
 281         for (i = 0; i < VM_MAXCPU; i++) {
 282                 vcpu_init(vm, i);
 283                 guest_msrs_init(vm, i);
 284         }
 285
 286         maxaddr = vmm_mem_maxaddr();
 287         vm->iommu = iommu_create_domain(maxaddr);
 288         vm_activate_cpu(vm, BSP);
 289
 290         return (vm);
 291 }
 292
 293 static void
 294 vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
 295 {
 296         size_t len;
 297         vm_paddr_t hpa;
 298         void *host_domain;
 299
 300         host_domain = iommu_host_domain();
 301
 302         len = 0;
 303         while (len < seg->len) {
 304                 hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
 305                 if (hpa == (vm_paddr_t)-1) {
 306                         panic("vm_free_mem_segs: cannot free hpa "
 307                               "associated with gpa 0x%016lx", seg->gpa + len);
 308                 }
 309
 310                 /*
 311                  * Remove the 'gpa' to 'hpa' mapping in VMs domain.
 312                  * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
 313                  */
 314                 iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
 315                 iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
 316
 317                 vmm_mem_free(hpa, PAGE_SIZE);
 318
 319                 len += PAGE_SIZE;
 320         }
 321
 322         /*
 323          * Invalidate cached translations associated with 'vm->iommu' since
 324          * we have now moved some pages from it.
 325          */
 326         iommu_invalidate_tlb(vm->iommu);
 327
 328         bzero(seg, sizeof(struct vm_memory_segment));
 329 }
 330
 331 void
 332 vm_destroy(struct vm *vm)
 333 {
 334         int i;
 335
 336         ppt_unassign_all(vm);
 337
 338         for (i = 0; i < vm->num_mem_segs; i++)
 339                 vm_free_mem_seg(vm, &vm->mem_segs[i]);
 340
 341         vm->num_mem_segs = 0;
 342
 343         for (i = 0; i < VM_MAXCPU; i++)
 344                 vcpu_cleanup(&vm->vcpu[i]);
 345
 346         iommu_destroy_domain(vm->iommu);
 347
 348         VMCLEANUP(vm->cookie);
 349
 350         free(vm, M_VM);
 351 }
 352
 353 const char *
 354 vm_name(struct vm *vm)
 355 {
 356         return (vm->name);
 357 }
 358
 359 int
 360 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 361 {
 362         const boolean_t spok = TRUE;    /* superpage mappings are ok */
 363
 364         return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
 365                            VM_PROT_RW, spok));
 366 }
 367
 368 int
 369 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 370 {
 371         const boolean_t spok = TRUE;    /* superpage mappings are ok */
 372
 373         return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
 374                            VM_PROT_NONE, spok));
 375 }
 376
 377 /*
 378  * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
 379  */
 380 static boolean_t
 381 vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
 382 {
 383         int i;
 384         vm_paddr_t gpabase, gpalimit;
 385
 386         if (gpa & PAGE_MASK)
 387                 panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
 388
 389         for (i = 0; i < vm->num_mem_segs; i++) {
 390                 gpabase = vm->mem_segs[i].gpa;
 391                 gpalimit = gpabase + vm->mem_segs[i].len;
 392                 if (gpa >= gpabase && gpa < gpalimit)
 393                         return (FALSE);
 394         }
 395
 396         return (TRUE);
 397 }
 398
 399 int
 400 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
 401 {
 402         int error, available, allocated;
 403         struct vm_memory_segment *seg;
 404         vm_paddr_t g, hpa;
 405         void *host_domain;
 406
 407         const boolean_t spok = TRUE;    /* superpage mappings are ok */
 408
 409         if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
 410                 return (EINVAL);
 411
 412         available = allocated = 0;
 413         g = gpa;
 414         while (g < gpa + len) {
 415                 if (vm_gpa_available(vm, g))
 416                         available++;
 417                 else
 418                         allocated++;
 419
 420                 g += PAGE_SIZE;
 421         }
 422
 423         /*
 424          * If there are some allocated and some available pages in the address
 425          * range then it is an error.
 426          */
 427         if (allocated && available)
 428                 return (EINVAL);
 429
 430         /*
 431          * If the entire address range being requested has already been
 432          * allocated then there isn't anything more to do.
 433          */
 434         if (allocated && available == 0)
 435                 return (0);
 436
 437         if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
 438                 return (E2BIG);
 439
 440         host_domain = iommu_host_domain();
 441
 442         seg = &vm->mem_segs[vm->num_mem_segs];
 443
 444         error = 0;
 445         seg->gpa = gpa;
 446         seg->len = 0;
 447         while (seg->len < len) {
 448                 hpa = vmm_mem_alloc(PAGE_SIZE);
 449                 if (hpa == 0) {
 450                         error = ENOMEM;
 451                         break;
 452                 }
 453
 454                 error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
 455                                    VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
 456                 if (error)
 457                         break;
 458
 459                 /*
 460                  * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
 461                  * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
 462                  */
 463                 iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
 464                 iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
 465
 466                 seg->len += PAGE_SIZE;
 467         }
 468
 469         if (error) {
 470                 vm_free_mem_seg(vm, seg);
 471                 return (error);
 472         }
 473
 474         /*
 475          * Invalidate cached translations associated with 'host_domain' since
 476          * we have now moved some pages from it.
 477          */
 478         iommu_invalidate_tlb(host_domain);
 479
 480         vm->num_mem_segs++;
 481
 482         return (0);
 483 }
 484
 485 vm_paddr_t
 486 vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
 487 {
 488         vm_paddr_t nextpage;
 489
 490         nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
 491         if (len > nextpage - gpa)
 492                 panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 493
 494         return (VMMMAP_GET(vm->cookie, gpa));
 495 }
 496
 497 int
 498 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
 499                   struct vm_memory_segment *seg)
 500 {
 501         int i;
 502
 503         for (i = 0; i < vm->num_mem_segs; i++) {
 504                 if (gpabase == vm->mem_segs[i].gpa) {
 505                         *seg = vm->mem_segs[i];
 506                         return (0);
 507                 }
 508         }
 509         return (-1);
 510 }
 511
 512 int
 513 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 514 {
 515
 516         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 517                 return (EINVAL);
 518
 519         if (reg >= VM_REG_LAST)
 520                 return (EINVAL);
 521
 522         return (VMGETREG(vm->cookie, vcpu, reg, retval));
 523 }
 524
 525 int
 526 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
 527 {
 528
 529         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 530                 return (EINVAL);
 531
 532         if (reg >= VM_REG_LAST)
 533                 return (EINVAL);
 534
 535         return (VMSETREG(vm->cookie, vcpu, reg, val));
 536 }
 537
 538 static boolean_t
 539 is_descriptor_table(int reg)
 540 {
 541
 542         switch (reg) {
 543         case VM_REG_GUEST_IDTR:
 544         case VM_REG_GUEST_GDTR:
 545                 return (TRUE);
 546         default:
 547                 return (FALSE);
 548         }
 549 }
 550
 551 static boolean_t
 552 is_segment_register(int reg)
 553 {
 554
 555         switch (reg) {
 556         case VM_REG_GUEST_ES:
 557         case VM_REG_GUEST_CS:
 558         case VM_REG_GUEST_SS:
 559         case VM_REG_GUEST_DS:
 560         case VM_REG_GUEST_FS:
 561         case VM_REG_GUEST_GS:
 562         case VM_REG_GUEST_TR:
 563         case VM_REG_GUEST_LDTR:
 564                 return (TRUE);
 565         default:
 566                 return (FALSE);
 567         }
 568 }
 569
 570 int
 571 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 572                 struct seg_desc *desc)
 573 {
 574
 575         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 576                 return (EINVAL);
 577
 578         if (!is_segment_register(reg) && !is_descriptor_table(reg))
 579                 return (EINVAL);
 580
 581         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 582 }
 583
 584 int
 585 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 586                 struct seg_desc *desc)
 587 {
 588         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 589                 return (EINVAL);
 590
 591         if (!is_segment_register(reg) && !is_descriptor_table(reg))
 592                 return (EINVAL);
 593
 594         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 595 }
 596
 597 int
 598 vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
 599 {
 600
 601         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 602                 return (EINVAL);
 603
 604         *cpuid = VCPU_PINCPU(vm, vcpuid);
 605
 606         return (0);
 607 }
 608
 609 int
 610 vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
 611 {
 612         struct thread *td;
 613
 614         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 615                 return (EINVAL);
 616
 617         td = curthread;         /* XXXSMP only safe when muxing vcpus */
 618
 619         /* unpin */
 620         if (host_cpuid < 0) {
 621                 VCPU_UNPIN(vm, vcpuid);
 622                 thread_lock(td);
 623                 sched_unbind(td);
 624                 thread_unlock(td);
 625                 return (0);
 626         }
 627
 628         if (CPU_ABSENT(host_cpuid))
 629                 return (EINVAL);
 630
 631         /*
 632          * XXX we should check that 'host_cpuid' has not already been pinned
 633          * by another vm.
 634          */
 635         thread_lock(td);
 636         sched_bind(td, host_cpuid);
 637         thread_unlock(td);
 638         VCPU_PIN(vm, vcpuid, host_cpuid);
 639
 640         return (0);
 641 }
 642
 643 static void
 644 restore_guest_fpustate(struct vcpu *vcpu)
 645 {
 646
 647         /* flush host state to the pcb */
 648         fpuexit(curthread);
 649
 650         /* restore guest FPU state */
 651         fpu_stop_emulating();
 652         fpurestore(vcpu->guestfpu);
 653
 654         /*
 655          * The FPU is now "dirty" with the guest's state so turn on emulation
 656          * to trap any access to the FPU by the host.
 657          */
 658         fpu_start_emulating();
 659 }
 660
 661 static void
 662 save_guest_fpustate(struct vcpu *vcpu)
 663 {
 664
 665         if ((rcr0() & CR0_TS) == 0)
 666                 panic("fpu emulation not enabled in host!");
 667
 668         /* save guest FPU state */
 669         fpu_stop_emulating();
 670         fpusave(vcpu->guestfpu);
 671         fpu_start_emulating();
 672 }
 673
 674 static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 675
 676 int
 677 vm_run(struct vm *vm, struct vm_run *vmrun)
 678 {
 679         int error, vcpuid, sleepticks, t;
 680         struct vcpu *vcpu;
 681         struct pcb *pcb;
 682         uint64_t tscval, rip;
 683         struct vm_exit *vme;
 684
 685         vcpuid = vmrun->cpuid;
 686
 687         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 688                 return (EINVAL);
 689
 690         vcpu = &vm->vcpu[vcpuid];
 691         vme = &vmrun->vm_exit;
 692         rip = vmrun->rip;
 693 restart:
 694         critical_enter();
 695
 696         tscval = rdtsc();
 697
 698         pcb = PCPU_GET(curpcb);
 699         set_pcb_flags(pcb, PCB_FULL_IRET);
 700
 701         restore_guest_msrs(vm, vcpuid);
 702         restore_guest_fpustate(vcpu);
 703
 704         vcpu->hostcpu = curcpu;
 705         error = VMRUN(vm->cookie, vcpuid, rip);
 706         vcpu->hostcpu = NOCPU;
 707
 708         save_guest_fpustate(vcpu);
 709         restore_host_msrs(vm, vcpuid);
 710
 711         vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 712
 713         /* copy the exit information */
 714         bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
 715
 716         critical_exit();
 717
 718         /*
 719          * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
 720          * is ready to run.
 721          */
 722         if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
 723                 vcpu_lock(vcpu);
 724
 725                 /*
 726                  * Figure out the number of host ticks until the next apic
 727                  * timer interrupt in the guest.
 728                  */
 729                 sleepticks = lapic_timer_tick(vm, vcpuid);
 730
 731                 /*
 732                  * If the guest local apic timer is disabled then sleep for
 733                  * a long time but not forever.
 734                  */
 735                 if (sleepticks < 0)
 736                         sleepticks = hz;
 737
 738                 /*
 739                  * Do a final check for pending NMI or interrupts before
 740                  * really putting this thread to sleep.
 741                  *
 742                  * These interrupts could have happened any time after we
 743                  * returned from VMRUN() and before we grabbed the vcpu lock.
 744                  */
 745                 if (!vm_nmi_pending(vm, vcpuid) &&
 746                     lapic_pending_intr(vm, vcpuid) < 0) {
 747                         if (sleepticks <= 0)
 748                                 panic("invalid sleepticks %d", sleepticks);
 749                         t = ticks;
 750                         msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
 751                         vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 752                 }
 753
 754                 vcpu_unlock(vcpu);
 755
 756                 rip = vme->rip + vme->inst_length;
 757                 goto restart;
 758         }
 759
 760         return (error);
 761 }
 762
 763 int
 764 vm_inject_event(struct vm *vm, int vcpuid, int type,
 765                 int vector, uint32_t code, int code_valid)
 766 {
 767         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 768                 return (EINVAL);
 769
 770         if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
 771                 return (EINVAL);
 772
 773         if (vector < 0 || vector > 255)
 774                 return (EINVAL);
 775
 776         return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
 777 }
 778
 779 static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 780
 781 int
 782 vm_inject_nmi(struct vm *vm, int vcpuid)
 783 {
 784         struct vcpu *vcpu;
 785
 786         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 787                 return (EINVAL);
 788
 789         vcpu = &vm->vcpu[vcpuid];
 790
 791         vcpu->nmi_pending = 1;
 792         vm_interrupt_hostcpu(vm, vcpuid);
 793         return (0);
 794 }
 795
 796 int
 797 vm_nmi_pending(struct vm *vm, int vcpuid)
 798 {
 799         struct vcpu *vcpu;
 800
 801         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 802                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 803
 804         vcpu = &vm->vcpu[vcpuid];
 805
 806         return (vcpu->nmi_pending);
 807 }
 808
 809 void
 810 vm_nmi_clear(struct vm *vm, int vcpuid)
 811 {
 812         struct vcpu *vcpu;
 813
 814         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 815                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 816
 817         vcpu = &vm->vcpu[vcpuid];
 818
 819         if (vcpu->nmi_pending == 0)
 820                 panic("vm_nmi_clear: inconsistent nmi_pending state");
 821
 822         vcpu->nmi_pending = 0;
 823         vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 824 }
 825
 826 int
 827 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 828 {
 829         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 830                 return (EINVAL);
 831
 832         if (type < 0 || type >= VM_CAP_MAX)
 833                 return (EINVAL);
 834
 835         return (VMGETCAP(vm->cookie, vcpu, type, retval));
 836 }
 837
 838 int
 839 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 840 {
 841         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 842                 return (EINVAL);
 843
 844         if (type < 0 || type >= VM_CAP_MAX)
 845                 return (EINVAL);
 846
 847         return (VMSETCAP(vm->cookie, vcpu, type, val));
 848 }
 849
 850 uint64_t *
 851 vm_guest_msrs(struct vm *vm, int cpu)
 852 {
 853         return (vm->vcpu[cpu].guest_msrs);
 854 }
 855
 856 struct vlapic *
 857 vm_lapic(struct vm *vm, int cpu)
 858 {
 859         return (vm->vcpu[cpu].vlapic);
 860 }
 861
 862 boolean_t
 863 vmm_is_pptdev(int bus, int slot, int func)
 864 {
 865         int found, i, n;
 866         int b, s, f;
 867         char *val, *cp, *cp2;
 868
 869         /*
 870          * XXX
 871          * The length of an environment variable is limited to 128 bytes which
 872          * puts an upper limit on the number of passthru devices that may be
 873          * specified using a single environment variable.
 874          *
 875          * Work around this by scanning multiple environment variable
 876          * names instead of a single one - yuck!
 877          */
 878         const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 879
 880         /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 881         found = 0;
 882         for (i = 0; names[i] != NULL && !found; i++) {
 883                 cp = val = getenv(names[i]);
 884                 while (cp != NULL && *cp != '\0') {
 885                         if ((cp2 = strchr(cp, ' ')) != NULL)
 886                                 *cp2 = '\0';
 887
 888                         n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 889                         if (n == 3 && bus == b && slot == s && func == f) {
 890                                 found = 1;
 891                                 break;
 892                         }
 893
 894                         if (cp2 != NULL)
 895                                 *cp2++ = ' ';
 896
 897                         cp = cp2;
 898                 }
 899                 freeenv(val);
 900         }
 901         return (found);
 902 }
 903
 904 void *
 905 vm_iommu_domain(struct vm *vm)
 906 {
 907
 908         return (vm->iommu);
 909 }
 910
 911 int
 912 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
 913 {
 914         int error;
 915         struct vcpu *vcpu;
 916
 917         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 918                 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
 919
 920         vcpu = &vm->vcpu[vcpuid];
 921
 922         vcpu_lock(vcpu);
 923
 924         /*
 925          * The following state transitions are allowed:
 926          * IDLE -> RUNNING -> IDLE
 927          * IDLE -> CANNOT_RUN -> IDLE
 928          */
 929         if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
 930             (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
 931                 error = 0;
 932                 vcpu->state = state;
 933         } else {
 934                 error = EBUSY;
 935         }
 936
 937         vcpu_unlock(vcpu);
 938
 939         return (error);
 940 }
 941
 942 enum vcpu_state
 943 vcpu_get_state(struct vm *vm, int vcpuid)
 944 {
 945         struct vcpu *vcpu;
 946         enum vcpu_state state;
 947
 948         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 949                 panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
 950
 951         vcpu = &vm->vcpu[vcpuid];
 952
 953         vcpu_lock(vcpu);
 954         state = vcpu->state;
 955         vcpu_unlock(vcpu);
 956
 957         return (state);
 958 }
 959
 960 void
 961 vm_activate_cpu(struct vm *vm, int vcpuid)
 962 {
 963
 964         if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
 965                 CPU_SET(vcpuid, &vm->active_cpus);
 966 }
 967
 968 cpuset_t
 969 vm_active_cpus(struct vm *vm)
 970 {
 971
 972         return (vm->active_cpus);
 973 }
 974
 975 void *
 976 vcpu_stats(struct vm *vm, int vcpuid)
 977 {
 978
 979         return (vm->vcpu[vcpuid].stats);
 980 }
 981
 982 int
 983 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 984 {
 985         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 986                 return (EINVAL);
 987
 988         *state = vm->vcpu[vcpuid].x2apic_state;
 989
 990         return (0);
 991 }
 992
 993 int
 994 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 995 {
 996         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 997                 return (EINVAL);
 998
 999         if (state < 0 || state >= X2APIC_STATE_LAST)
1000                 return (EINVAL);
1001
1002         vm->vcpu[vcpuid].x2apic_state = state;
1003
1004         vlapic_set_x2apic_state(vm, vcpuid, state);
1005
1006         return (0);
1007 }
1008
1009 void
1010 vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
1011 {
1012         int hostcpu;
1013         struct vcpu *vcpu;
1014
1015         vcpu = &vm->vcpu[vcpuid];
1016
1017         vcpu_lock(vcpu);
1018         hostcpu = vcpu->hostcpu;
1019         if (hostcpu == NOCPU) {
1020                 /*
1021                  * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
1022                  * the host thread must be sleeping waiting for an event to
1023                  * kick the vcpu out of 'hlt'.
1024                  *
1025                  * XXX this is racy because the condition exists right before
1026                  * and after calling VMRUN() in vm_run(). The wakeup() is
1027                  * benign in this case.
1028                  */
1029                 if (vcpu->state == VCPU_RUNNING)
1030                         wakeup_one(vcpu);
1031         } else {
1032                 if (vcpu->state != VCPU_RUNNING)
1033                         panic("invalid vcpu state %d", vcpu->state);
1034                 if (hostcpu != curcpu)
1035                         ipi_cpu(hostcpu, vmm_ipinum);
1036         }
1037         vcpu_unlock(vcpu);
1038 }