2 * SPDX-License-Identifier: BSD-2-Clause
4 * Copyright (c) 2011 NetApp, Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/types.h>
30 #ifndef WITHOUT_CAPSICUM
31 #include <sys/capsicum.h>
35 #include <sys/socket.h>
43 #include <machine/atomic.h>
45 #ifndef WITHOUT_CAPSICUM
46 #include <capsicum_helpers.h>
60 #include <pthread_np.h>
85 #include "amd64/pci_lpc.h"
87 #include "qemu_fwcfg.h"
91 #include "tpm_device.h"
95 #define MB (1024UL * 1024)
96 #define GB (1024UL * MB)
99 uint16_t cpu_cores, cpu_sockets, cpu_threads;
103 static char *progname;
104 static const int BSP = 0;
106 static cpuset_t cpumask;
108 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);
110 static struct vcpu_info {
116 static cpuset_t **vcpumap;
123 "Usage: %s [-AaCDeHhPSuWwxY]\n"
124 " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
125 " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n"
126 " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n"
127 " -A: create ACPI tables\n"
128 " -a: local apic is in xAPIC mode (deprecated)\n"
129 " -C: include guest memory in core file\n"
130 " -c: number of CPUs and/or topology specification\n"
131 " -D: destroy on power-off\n"
132 " -e: exit on unhandled I/O access\n"
133 " -G: start a debug server\n"
134 " -H: vmexit from the guest on HLT\n"
136 " -k: key=value flat config file\n"
137 " -K: PS2 keyboard layout\n"
138 " -l: LPC device configuration\n"
140 " -o: set config 'var' to 'value'\n"
141 " -P: vmexit from the guest on pause\n"
142 " -p: pin 'vcpu' to 'hostcpu'\n"
143 #ifdef BHYVE_SNAPSHOT
144 " -r: path to checkpoint file\n"
146 " -S: guest memory cannot be swapped\n"
147 " -s: <slot,driver,configinfo> PCI slot config\n"
149 " -u: RTC keeps UTC time\n"
150 " -W: force virtio to use single-vector MSI\n"
151 " -w: ignore unimplemented MSRs\n"
152 " -x: local APIC is in x2APIC mode\n"
153 " -Y: disable MPtable generation\n",
154 progname, (int)strlen(progname), "", (int)strlen(progname), "",
155 (int)strlen(progname), "");
161 * XXX This parser is known to have the following issues:
162 * 1. It accepts null key=value tokens ",," as setting "cpus" to an
165 * The acceptance of a null specification ('-c ""') is by design to match the
166 * manual page syntax specification, this results in a topology of 1 vCPU.
/*
 * Parse a comma-separated CPU topology specification and record the
 * result in the config tree through set_config_value().
 *
 * NOTE(review): this excerpt omits lines of the function (return type,
 * braces, some conditions, cleanup and return statements), so the
 * comments below describe only the statements that are visible.
 */
169 topology_parse(const char *opt)
171 char *cp, *str, *tofree;
/*
 * Every topology key is set to the string 1 here.  The condition that
 * guards these four calls is not visible in this excerpt -- TODO confirm
 * whether they run for every input or only for an empty specification.
 */
174 set_config_value("sockets", "1");
175 set_config_value("cores", "1");
176 set_config_value("threads", "1");
177 set_config_value("cpus", "1");
/*
 * strsep() below modifies the string it scans, so the work is done on a
 * private copy.  tofree keeps the start of that copy (presumably for a
 * later free() that is not visible here).
 */
181 tofree = str = strdup(opt);
183 errx(4, "Failed to allocate memory");
/*
 * Walk the comma-separated tokens.  A token that begins with one of the
 * known key= prefixes stores the text after the prefix under that key.
 */
185 while ((cp = strsep(&str, ",")) != NULL) {
186 if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
187 set_config_value("cpus", cp + strlen("cpus="));
188 else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0)
189 set_config_value("sockets", cp + strlen("sockets="));
190 else if (strncmp(cp, "cores=", strlen("cores=")) == 0)
191 set_config_value("cores", cp + strlen("cores="));
192 else if (strncmp(cp, "threads=", strlen("threads=")) == 0)
193 set_config_value("threads", cp + strlen("threads="));
/*
 * Any other token containing an equals sign takes a branch whose body is
 * missing from this excerpt.  The last visible statement stores the whole
 * token as the cpus value (presumably the fall-through for a bare number
 * -- TODO confirm against the full source).
 */
194 else if (strchr(cp, '=') != NULL)
197 set_config_value("cpus", cp);
/*
 * Convert value to an integer with strtol() and terminate the program via
 * errx(4, ...) naming key and value when the text is not acceptable.
 *
 * NOTE(review): lines of this function are missing from the excerpt,
 * including any errno reset before strtol() and the return statement.
 */
208 parse_int_value(const char *key, const char *value, int minval, int maxval)
/* Base 0 lets strtol() pick the radix from the prefix of the text. */
214 lval = strtol(value, &cp, 0);
/*
 * Rejected here: a strtol() error reported through errno, trailing
 * characters after the number, no digits consumed at all, and a result
 * below minval.  The remainder of the condition is not visible
 * (presumably the maxval bound -- TODO confirm).
 */
215 if (errno != 0 || *cp != '\0' || cp == value || lval < minval ||
217 errx(4, "Invalid value for %s: '%s'", key, value);
222 * Set the sockets, cores, threads, and guest_cpus variables based on
223 * the configured topology.
225 * The limits of UINT16_MAX are due to the types passed to
226 * vm_set_topology(). vmm.ko may enforce tighter limits.
235 value = get_config_value("cpus");
237 guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
238 explicit_cpus = true;
241 explicit_cpus = false;
243 value = get_config_value("cores");
245 cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX);
248 value = get_config_value("threads");
250 cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX);
253 value = get_config_value("sockets");
255 cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
257 cpu_sockets = guest_ncpus;
260 * Compute sockets * cores * threads avoiding overflow. The
261 * range check above ensures these are 16-bit values.
263 ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads;
264 if (ncpus > UINT16_MAX)
265 errx(4, "Computed number of vCPUs too high: %ju",
269 if (guest_ncpus != (int)ncpus)
270 errx(4, "Topology (%d sockets, %d cores, %d threads) "
271 "does not match %d vCPUs",
272 cpu_sockets, cpu_cores, cpu_threads,
/*
 * Parse one vcpu:hostcpu pinning pair and append hostcpu to the config
 * value stored under the key vcpu.N.cpuset, where N is the vcpu number.
 *
 * NOTE(review): this excerpt omits lines of the function (declarations,
 * the vcpu range condition, return statements), so only the visible
 * statements are described.
 */
279 pincpu_parse(const char *opt)
/* Both integers must be converted, otherwise the format is reported as invalid. */
286 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
287 fprintf(stderr, "invalid format: %s\n", opt);
/* The condition that leads to this message is not visible here. */
292 fprintf(stderr, "invalid vcpu '%d'\n", vcpu);
/* hostcpu must lie in the range 0 to CPU_SETSIZE - 1 inclusive. */
296 if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
297 fprintf(stderr, "hostcpu '%d' outside valid range from "
298 "0 to %d\n", pcpu, CPU_SETSIZE - 1);
302 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
303 value = get_config_value(key);
/*
 * Build the new list: the existing value (if any) followed by a comma and
 * the new hostcpu, or just the hostcpu when no value was set before.
 */
305 if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
306 value != NULL ? "," : "", pcpu) == -1) {
307 perror("failed to build new cpuset string");
311 set_config_value(key, newval);
/*
 * Parse the host CPU list given for one vcpu.  Malformed input terminates
 * the program through errx(4, ...).
 *
 * NOTE(review): most of the body is missing from this excerpt, including
 * the loop structure and every statement that writes to *set, so how the
 * set is populated cannot be confirmed from here.
 */
317 parse_cpuset(int vcpu, const char *list, cpuset_t *set)
/* token starts at list with the const qualifier removed by __DECONST. */
324 token = __DECONST(char *, list);
/* Base 0 lets strtoul() pick the radix from the prefix of the text. */
326 pcpu = strtoul(token, &cp, 0);
328 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
/* Each host CPU number must lie in the range 0 to CPU_SETSIZE - 1. */
329 if (pcpu < 0 || pcpu >= CPU_SETSIZE)
330 errx(4, "hostcpu '%d' outside valid range from 0 to %d",
331 pcpu, CPU_SETSIZE - 1);
/*
 * Range handling: an invalid range is fatal, and the loop below runs while
 * start is below pcpu.  The loop body is not visible in this excerpt.
 */
337 errx(4, "Invalid hostcpu range %d-%d",
339 while (start < pcpu) {
349 errx(4, "invalid cpuset for vcpu %d: '%s'",
354 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
369 vcpumap = calloc(guest_ncpus, sizeof(*vcpumap));
370 for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
371 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
372 value = get_config_value(key);
375 vcpumap[vcpu] = malloc(sizeof(cpuset_t));
376 if (vcpumap[vcpu] == NULL)
377 err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
378 parse_cpuset(vcpu, value, vcpumap[vcpu]);
/*
 * Thin wrapper: forwards ctx, gaddr and len to vm_map_gpa() and returns
 * its result unchanged.
 * NOTE(review): the return type line and braces are missing from this
 * excerpt.
 */
383 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
386 return (vm_map_gpa(ctx, gaddr, len));
389 #ifdef BHYVE_SNAPSHOT
/*
 * Thin wrapper: forwards ctx and addr to vm_rev_map_gpa() and returns its
 * result unchanged.  The definition follows an ifdef on BHYVE_SNAPSHOT.
 * NOTE(review): the return type line and braces are missing from this
 * excerpt.
 */
391 paddr_host2guest(struct vmctx *ctx, void *addr)
393 return (vm_rev_map_gpa(ctx, addr));
/*
 * Return the virtio_msix config boolean as reported by
 * get_config_bool_default(), with true passed as the default argument.
 * NOTE(review): the return type line and braces are missing from this
 * excerpt.
 */
398 fbsdrun_virtio_msix(void)
401 return (get_config_bool_default("virtio_msix", true));
/*
 * Return the vcpu member of the vcpu_info[] entry for vcpuid.  No bounds
 * check on vcpuid is visible here.
 * NOTE(review): the return type line and braces are missing from this
 * excerpt.
 */
405 fbsdrun_vcpu(int vcpuid)
407 return (vcpu_info[vcpuid].vcpu);
/*
 * Thread start routine for one vcpu (passed to pthread_create() by
 * fbsdrun_addcpu()).  It names the thread, applies a host CPU affinity
 * when one is recorded for the vcpu, and then calls vm_loop().
 *
 * param points to the struct vcpu_info of the vcpu to run.
 *
 * NOTE(review): lines of this function are missing from the excerpt
 * (error checks, endif lines, the return), so only the visible statements
 * are described.
 */
411 fbsdrun_start_thread(void *param)
413 char tname[MAXCOMLEN + 1];
414 struct vcpu_info *vi = param;
/* The thread name has the form: vcpu N. */
417 snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid);
418 pthread_set_name_np(pthread_self(), tname);
/* A NULL vcpumap entry means no affinity is applied to this thread. */
420 if (vcpumap[vi->vcpuid] != NULL) {
421 error = pthread_setaffinity_np(pthread_self(),
422 sizeof(cpuset_t), vcpumap[vi->vcpuid]);
/* Snapshot builds also register the vcpu with the checkpoint code. */
426 #ifdef BHYVE_SNAPSHOT
427 checkpoint_cpu_add(vi->vcpuid);
430 gdb_cpu_add(vi->vcpu);
433 vm_loop(vi->ctx, vi->vcpu);
/*
 * Bring one vcpu into service: activate it, mark it in cpumask, and create
 * a thread that runs fbsdrun_start_thread() for it.
 *
 * NOTE(review): lines of this function are missing from the excerpt
 * (declarations, the condition in front of err(), any check of the
 * pthread_create() result), so only the visible statements are described.
 */
441 fbsdrun_addcpu(int vcpuid)
443 struct vcpu_info *vi;
447 vi = &vcpu_info[vcpuid];
449 error = vm_activate_cpu(vi->vcpu);
451 err(EX_OSERR, "could not activate CPU %d", vi->vcpuid);
/* Record the vcpu in the global cpumask using the atomic set macro. */
453 CPU_SET_ATOMIC(vcpuid, &cpumask);
/* The vcpu is suspended before its thread is created. */
455 vm_suspend_cpu(vi->vcpu);
457 error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi);
/*
 * Remove vcpu from cpumask while holding resetcpu_mtx.
 *
 * Two paths are visible after the removal: one signals resetcpu_cond and
 * drops the mutex, the other waits on resetcpu_cond until cpumask is empty
 * and then drops the mutex.
 *
 * NOTE(review): the lines that choose between those two paths, and what
 * happens after the unknown-cpu message, are missing from this excerpt.
 */
462 fbsdrun_deletecpu(int vcpu)
/* Function-local statics: one mutex and one condition variable shared by all callers. */
464 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
465 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
467 pthread_mutex_lock(&resetcpu_mtx);
/* A vcpu that is not in cpumask is reported as unknown. */
468 if (!CPU_ISSET(vcpu, &cpumask)) {
469 EPRINTLN("Attempting to delete unknown cpu %d", vcpu);
473 CPU_CLR(vcpu, &cpumask);
/* Signalling path: wake a waiter on resetcpu_cond, then release the mutex. */
476 pthread_cond_signal(&resetcpu_cond);
477 pthread_mutex_unlock(&resetcpu_mtx);
/* Waiting path: sleep on resetcpu_cond until every vcpu has left cpumask. */
482 while (!CPU_EMPTY(&cpumask)) {
483 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
485 pthread_mutex_unlock(&resetcpu_mtx);
/*
 * Thin wrapper: calls vm_suspend_cpu() on the vcpu stored in
 * vcpu_info[vcpuid] and returns its result.  No bounds check on vcpuid is
 * visible here.
 * NOTE(review): the return type line and braces are missing from this
 * excerpt.
 */
489 fbsdrun_suspendcpu(int vcpuid)
491 return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu));
/*
 * Run one vcpu: call vm_run() and dispatch the resulting exit code through
 * the vmexit_handlers[] table.
 *
 * NOTE(review): the loop construct, the switch on the handler result and
 * the exit paths are missing from this excerpt (presumably vm_run() is
 * called repeatedly -- TODO confirm), so only the visible statements are
 * described.
 */
495 vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
500 enum vm_exitcode exitcode;
501 cpuset_t active_cpus, dmask;
/* The vcpu handed to this function is asserted to be in the active set. */
503 error = vm_active_cpus(ctx, &active_cpus);
504 assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus));
/* vmrun is pointed at the local exit record and the local cpuset dmask. */
506 vmrun.vm_exit = &vme;
507 vmrun.cpuset = &dmask;
508 vmrun.cpusetsize = sizeof(dmask);
511 error = vm_run(vcpu, &vmrun);
515 exitcode = vme.exitcode;
/*
 * Guard the table lookup: an exit code at or past VM_EXITCODE_MAX, or one
 * with no registered handler, is reported as unexpected.
 */
516 if (exitcode >= VM_EXITCODE_MAX ||
517 vmexit_handlers[exitcode] == NULL) {
518 warnx("vm_loop: unexpected exitcode 0x%x", exitcode);
522 rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun);
525 case VMEXIT_CONTINUE:
533 EPRINTLN("vm_run error %d, errno %d", error, errno);
/*
 * Work out how many vcpus the guest may use.  The visible statements query
 * the VM_CAP_UNRESTRICTED_GUEST capability of vcpu and the topology of ctx
 * (sockets, cores, threads, maxcpus).
 *
 * NOTE(review): the return statements and the checks of both error values
 * are missing from this excerpt, so the exact value returned in each case
 * cannot be confirmed from here.
 */
537 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu)
539 uint16_t sockets, cores, threads, maxcpus;
543 * The guest is allowed to spinup more than one processor only if the
544 * UNRESTRICTED_GUEST capability is available.
546 error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp);
550 error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus);
/*
 * Create the virtual machine named vmname (an EEXIST result from
 * vm_create() is handled specially), open it, and apply the configured
 * CPU topology.
 *
 * Visible steps: vm_create(), vm_open(), vm_limit_rights() unless built
 * with WITHOUT_CAPSICUM, vm_reinit(), and vm_set_topology() with the
 * global cpu_sockets, cpu_cores and cpu_threads values.
 *
 * NOTE(review): the conditions that set and test reinit and romboot, the
 * error handling and the return statement are missing from this excerpt
 * (presumably the opened context is returned -- TODO confirm).
 */
557 static struct vmctx *
558 do_open(const char *vmname)
562 bool reinit, romboot;
564 reinit = romboot = false;
571 error = vm_create(vmname);
573 if (errno == EEXIST) {
578 * The virtual machine has been setup by the
579 * userspace bootloader.
589 * If the virtual machine was just created then a
590 * bootrom must be configured to boot it.
592 fprintf(stderr, "virtual machine cannot be booted\n");
597 ctx = vm_open(vmname);
603 #ifndef WITHOUT_CAPSICUM
604 if (vm_limit_rights(ctx) != 0)
605 err(EX_OSERR, "vm_limit_rights");
609 error = vm_reinit(ctx);
/* The final argument (maxcpus) is passed as 0. */
615 error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0);
617 errx(EX_OSERR, "vm_set_topology");
/*
 * Split option at its first equals sign and store the text after it under
 * the config path named by the text before it.
 *
 * Callers in this file treat a false result as an invalid option.
 *
 * NOTE(review): the return statements and the release of path are missing
 * from this excerpt.
 */
622 parse_config_option(const char *option)
627 value = strchr(option, '=');
/*
 * An option with no equals sign, or with nothing after it, takes the
 * branch below (body not visible; presumably it returns false).
 */
628 if (value == NULL || value[1] == '\0')
/* path is a heap copy of the characters in front of the equals sign. */
630 path = strndup(option, value - option);
632 err(4, "Failed to allocate memory");
633 set_config_value(path, value + 1);
/*
 * Read the flat config file at path line by line and hand each line to
 * parse_config_option().  A file that cannot be opened, or a line that
 * parse_config_option() rejects, terminates the program with exit code 4.
 *
 * NOTE(review): lines of this function are missing from the excerpt
 * (declarations, newline removal, cleanup), so only the visible statements
 * are described.
 */
638 parse_simple_config_file(const char *path)
645 fp = fopen(path, "r");
647 err(4, "Failed to open configuration file %s", path);
/* lineno counts from 1 and is used in the error message below. */
651 for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) {
/* Lines that start with a hash sign and empty lines take a separate branch (body not visible; presumably skipped). */
652 if (*line == '#' || *line == '\n')
/* Locate the newline; the statement that uses cp is not visible here. */
654 cp = strchr(line, '\n');
657 if (!parse_config_option(line))
658 errx(4, "%s line %u: invalid config option '%s'", path,
/*
 * Translate the debug server option string into config values.  The
 * visible statements can set gdb.wait to true, gdb.address to opt, and
 * gdb.port to sport.
 *
 * NOTE(review): every condition in this function is missing from the
 * excerpt, as is the code that derives sport, so which of these values are
 * set for a given input cannot be confirmed from here.
 */
667 parse_gdb_options(const char *opt)
673 set_config_bool("gdb.wait", true);
/* strrchr() finds the last colon in opt. */
677 colon = strrchr(opt, ':');
684 set_config_value("gdb.address", opt);
687 set_config_value("gdb.port", sport);
692 main(int argc, char *argv[])
695 int max_vcpus, memflags;
699 const char *optstr, *value, *vmname;
700 #ifdef BHYVE_SNAPSHOT
702 struct restore_state rstate;
709 progname = basename(argv[0]);
711 #ifdef BHYVE_SNAPSHOT
712 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
714 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
716 while ((c = getopt(argc, argv, optstr)) != -1) {
720 set_config_bool("x86.x2apic", false);
725 * NOP. For backward compatibility. Most systems don't
726 * work properly without sane ACPI tables. Therefore,
727 * we're always generating them.
731 set_config_bool("destroy_on_poweroff", true);
734 if (pincpu_parse(optarg) != 0) {
735 errx(EX_USAGE, "invalid vcpu pinning "
736 "configuration '%s'", optarg);
740 if (topology_parse(optarg) != 0) {
741 errx(EX_USAGE, "invalid cpu topology "
746 set_config_bool("memory.guest_in_core", true);
749 if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) {
750 errx(EX_USAGE, "invalid fwcfg item '%s'", optarg);
755 parse_gdb_options(optarg);
759 parse_simple_config_file(optarg);
762 set_config_value("keyboard.layout", optarg);
766 if (strncmp(optarg, "help", strlen(optarg)) == 0) {
767 lpc_print_supported_devices();
769 } else if (lpc_device_parse(optarg) != 0) {
770 errx(EX_USAGE, "invalid lpc device "
771 "configuration '%s'", optarg);
775 #ifdef BHYVE_SNAPSHOT
777 restore_file = optarg;
781 if (strncmp(optarg, "help", strlen(optarg)) == 0) {
782 pci_print_supported_devices();
784 } else if (pci_parse_slot(optarg) != 0)
789 set_config_bool("memory.wired", true);
792 set_config_value("memory.size", optarg);
795 if (!parse_config_option(optarg))
796 errx(EX_USAGE, "invalid configuration option '%s'", optarg);
800 set_config_bool("x86.vmexit_on_hlt", true);
804 * The "-I" option was used to add an ioapic to the
807 * An ioapic is now provided unconditionally for each
808 * virtual machine and this option is now deprecated.
812 set_config_bool("x86.vmexit_on_pause", true);
815 set_config_bool("x86.strictio", true);
818 set_config_bool("rtc.use_localtime", false);
822 set_config_value("uuid", optarg);
826 set_config_bool("x86.strictmsr", false);
830 set_config_bool("virtio_msix", false);
834 set_config_bool("x86.x2apic", true);
837 set_config_bool("x86.mptable", false);
852 #ifdef BHYVE_SNAPSHOT
853 if (restore_file != NULL) {
854 error = load_restore_file(restore_file, &rstate);
856 fprintf(stderr, "Failed to read checkpoint info from "
857 "file: '%s'.\n", restore_file);
860 vmname = lookup_vmname(&rstate);
862 set_config_value("name", vmname);
867 set_config_value("name", argv[0]);
869 vmname = get_config_value("name");
873 if (get_config_bool_default("config.dump", false)) {
881 value = get_config_value("memory.size");
882 error = vm_parse_memsize(value, &memsize);
884 errx(EX_USAGE, "invalid memsize '%s'", value);
886 ctx = do_open(vmname);
888 #ifdef BHYVE_SNAPSHOT
889 if (restore_file != NULL) {
890 guest_ncpus = lookup_guest_ncpus(&rstate);
891 memflags = lookup_memflags(&rstate);
892 memsize = lookup_memsize(&rstate);
895 if (guest_ncpus < 1) {
896 fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
901 bsp = vm_vcpu_open(ctx, BSP);
902 max_vcpus = num_vcpus_allowed(ctx, bsp);
903 if (guest_ncpus > max_vcpus) {
904 fprintf(stderr, "%d vCPUs requested but only %d available\n",
905 guest_ncpus, max_vcpus);
909 bhyve_init_vcpu(bsp);
911 /* Allocate per-VCPU resources. */
912 vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info));
913 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) {
914 vcpu_info[vcpuid].ctx = ctx;
915 vcpu_info[vcpuid].vcpuid = vcpuid;
917 vcpu_info[vcpuid].vcpu = bsp;
919 vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
923 if (get_config_bool_default("memory.wired", false))
924 memflags |= VM_MEM_F_WIRED;
925 if (get_config_bool_default("memory.guest_in_core", false))
926 memflags |= VM_MEM_F_INCORE;
927 vm_set_memflags(ctx, memflags);
928 error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
930 fprintf(stderr, "Unable to setup memory (%d)\n", errno);
934 init_mem(guest_ncpus);
936 if (bhyve_init_platform(ctx, bsp) != 0)
939 if (qemu_fwcfg_init(ctx) != 0) {
940 fprintf(stderr, "qemu fwcfg initialization error\n");
944 if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus),
945 &guest_ncpus) != 0) {
946 fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n");
951 * Exit if a device emulation finds an error in its initialization
953 if (init_pci(ctx) != 0) {
954 EPRINTLN("Device emulation initialization error: %s",
958 if (init_tpm(ctx) != 0) {
959 EPRINTLN("Failed to init TPM device");
964 * Initialize after PCI, to allow a bootrom file to reserve the high
967 if (get_config_bool("acpi_tables"))
977 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
978 bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP);
980 #ifdef BHYVE_SNAPSHOT
981 if (restore_file != NULL) {
982 FPRINTLN(stdout, "Pausing pci devs...");
983 if (vm_pause_devices() != 0) {
984 EPRINTLN("Failed to pause PCI device state.");
988 FPRINTLN(stdout, "Restoring vm mem...");
989 if (restore_vm_mem(ctx, &rstate) != 0) {
990 EPRINTLN("Failed to restore VM memory.");
994 FPRINTLN(stdout, "Restoring pci devs...");
995 if (vm_restore_devices(&rstate) != 0) {
996 EPRINTLN("Failed to restore PCI device state.");
1000 FPRINTLN(stdout, "Restoring kernel structs...");
1001 if (vm_restore_kern_structs(ctx, &rstate) != 0) {
1002 EPRINTLN("Failed to restore kernel structs.");
1006 FPRINTLN(stdout, "Resuming pci devs...");
1007 if (vm_resume_devices() != 0) {
1008 EPRINTLN("Failed to resume PCI device state.");
1014 if (bhyve_init_platform_late(ctx, bsp) != 0)
1018 * Change the proc title to include the VM name.
1020 setproctitle("%s", vmname);
1022 #ifdef BHYVE_SNAPSHOT
1024 * checkpointing thread for communication with bhyvectl
1026 if (init_checkpoint_thread(ctx) != 0)
1027 errx(EX_OSERR, "Failed to start checkpoint thread");
1030 #ifndef WITHOUT_CAPSICUM
1031 caph_cache_catpages();
1033 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1034 errx(EX_OSERR, "Unable to apply rights for sandbox");
1036 if (caph_enter() == -1)
1037 errx(EX_OSERR, "cap_enter() failed");
1040 #ifdef BHYVE_SNAPSHOT
1041 if (restore_file != NULL) {
1042 destroy_restore_state(&rstate);
1043 if (vm_restore_time(ctx) < 0)
1044 err(EX_OSERR, "Unable to restore time");
1046 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
1047 vm_resume_cpu(vcpu_info[vcpuid].vcpu);
1053 * Head off to the main event dispatch loop