/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

static int
vmm_priv_check(struct ucred *ucred)
{

        if (jailed(ucred) &&
            !(ucred->cr_prison->pr_allow & pr_allow_flag))
                return (EPERM);

        return (0);
}

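/*
 * Locking conventions (summary of the helpers below): per-vcpu ioctls
 * freeze just their target vcpu (VCPU_FROZEN) with vcpu_lock_one(), while
 * VM-wide ioctls freeze every vcpu with vcpu_lock_all().  Freezing a
 * single vcpu also doubles as a read lock on the guest memory map, since
 * operations that modify the map run with all vcpus frozen.
 */
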
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;
        uint16_t maxcpus;

        error = 0;      /* don't read 'error' uninitialized if the loop never runs */
        maxcpus = vm_get_maxcpus(sc->vm);
        for (vcpu = 0; vcpu < maxcpus; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;
        uint16_t maxcpus;

        maxcpus = vm_get_maxcpus(sc->vm);
        for (vcpu = 0; vcpu < maxcpus; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa, maxaddr;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;
        uint16_t lastcpu;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        lastcpu = vm_get_maxcpus(sc->vm) - 1;
        error = vcpu_lock_one(sc, lastcpu);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        maxaddr = vmm_sysmem_maxaddr(sc->vm);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ && gpa < maxaddr)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, lastcpu);
        return (error);
}

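/*
 * Illustrative usage sketch ("myvm" is a placeholder VM name): guest
 * physical memory can be inspected through the read entry point above,
 * e.g. with dd(1) as the comment in vmmdev_rw() suggests:
 *
 *      dd if=/dev/vmm/myvm bs=4k skip=256 count=1 | hexdump -C
 *
 * reads the 4KB page at guest physical address 0x100000.  Reads from
 * holes below the top of system memory return zeroes; writes to holes
 * fail with EFAULT.
 */
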
CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, sizeof(mseg->name),
                    NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        /*
         * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
         * be stripped off when devfs processes the full string.
         */
        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, sizeof(mseg->name), NULL);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_register_set *vmregset;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        struct vm_cpu_topology *topology;
        uint64_t *regvals;
        int *regnums;
#ifdef BHYVE_SNAPSHOT
        struct vm_snapshot_meta *snapshot_meta;
#endif

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_GLA2GPA_NOFAULT:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = vm_get_maxcpus(sc->vm) - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                    statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                    &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                    pptmsi->bus, pptmsi->slot, pptmsi->func,
                    pptmsi->addr, pptmsi->msg, pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                    pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx,
                    pptmsix->addr, pptmsix->msg, pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                    pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                    pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                    pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                    &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                    vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                    vmsegdesc->regnum, &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                    vmsegdesc->regnum, &vmsegdesc->desc);
                break;
        case VM_GET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = vm_get_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                if (error == 0)
                        error = copyout(regvals, vmregset->regvals,
                            sizeof(regvals[0]) * vmregset->count);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_SET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = copyin(vmregset->regvals, regvals,
                            sizeof(regvals[0]) * vmregset->count);
                if (error == 0)
                        error = vm_set_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                    vmcap->captype, &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                    vmcap->captype, vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm, x2apic->cpuid,
                    x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm, x2apic->cpuid,
                    &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_GLA2GPA_NOFAULT:
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
                    gg->gla, gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa_nofault unknown error %d", __func__,
                    error));
                break;
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else if (vm_cpuset->which == VM_DEBUG_CPUS)
                        *cpuset = vm_debug_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SUSPEND_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_suspend_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_RESUME_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_resume_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        case VM_SET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                error = vm_set_topology(sc->vm, topology->sockets,
                    topology->cores, topology->threads, topology->maxcpus);
                break;
        case VM_GET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
                    &topology->threads, &topology->maxcpus);
                error = 0;
                break;
#ifdef BHYVE_SNAPSHOT
        case VM_SNAPSHOT_REQ:
                snapshot_meta = (struct vm_snapshot_meta *)data;
                error = vm_snapshot_req(sc->vm, snapshot_meta);
                break;
        case VM_RESTORE_TIME:
                error = vm_restore_time(sc->vm);
                break;
#endif
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /*
         * Make sure that no handler returns a kernel-internal
         * error value to userspace.
         */
        KASSERT(error == ERESTART || error >= 0,
            ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}

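/*
 * Illustrative userspace sketch ("myvm" and vcpu 0 are placeholders): the
 * per-vcpu ioctls above are driven by opening the VM's device node and
 * passing the request structures declared in <machine/vmm_dev.h>, e.g. to
 * fetch a register:
 *
 *      int fd = open("/dev/vmm/myvm", O_RDWR);
 *      struct vm_register vmreg = {
 *              .cpuid = 0,             // first field must be the vcpu
 *              .regnum = VM_REG_GUEST_RIP,
 *      };
 *      if (ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
 *              printf("rip = 0x%lx\n", vmreg.regval);
 *
 * In practice userspace goes through libvmmapi's wrappers rather than raw
 * ioctls.
 */
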
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        uint16_t lastcpu;
        bool sysmem;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        lastcpu = vm_get_maxcpus(sc->vm) - 1;
        error = vcpu_lock_one(sc, lastcpu);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, lastcpu);
        return (error);
}

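/*
 * Illustrative userspace sketch (placeholders as above): system memory can
 * be mapped into the host process at its guest physical offset, e.g. to
 * map the first 1MB of guest RAM read/write:
 *
 *      int fd = open("/dev/vmm/myvm", O_RDWR);
 *      void *p = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *          MAP_SHARED, fd, 0);
 *
 * PROT_EXEC mappings are rejected, and the request must fall entirely
 * within a single sysmem mapping in the guest memory map; devmem segments
 * are instead mapped through their own /dev/vmm.io nodes (see
 * devmem_mmap_single() below).
 */
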
static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        char *buf;
        int error, buflen;

        error = vmm_priv_check(req->td->td_ucred);
        if (error)
                return (error);

        buflen = VM_MAX_NAMELEN + 1;
        buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
        strlcpy(buf, "beavis", buflen);
        error = sysctl_handle_string(oidp, buf, buflen, req);
        if (error != 0 || req->newptr == NULL)
                goto out;

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                error = EINVAL;
                goto out;
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        error = 0;

out:
        free(buf, M_VMMDEV);
        return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

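/*
 * Illustrative usage (shell, "myvm" is a placeholder): a VM is torn down
 * by writing its name to this sysctl; bhyvectl(8) --destroy ultimately
 * drives the same path:
 *
 *      sysctl hw.vmm.destroy=myvm
 */
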
static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char *buf;
        int error, buflen;

        error = vmm_priv_check(req->td->td_ucred);
        if (error)
                return (error);

        buflen = VM_MAX_NAMELEN + 1;
        buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
        strlcpy(buf, "beavis", buflen);
        error = sysctl_handle_string(oidp, buf, buflen, req);
        if (error != 0 || req->newptr == NULL)
                goto out;

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL) {
                error = EEXIST;
                goto out;
        }

        error = vm_create(buf, &vm);
        if (error != 0)
                goto out;

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Look up the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                error = EEXIST;
                goto out;
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                goto out;
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

out:
        free(buf, M_VMMDEV);
        return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

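/*
 * Illustrative usage (shell, "myvm" is a placeholder): creating a VM makes
 * the /dev/vmm/myvm node appear, after which it can be opened and driven
 * with the ioctls above:
 *
 *      sysctl hw.vmm.create=myvm
 *      ls /dev/vmm/
 */
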
void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
        pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
            "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        uint16_t lastcpu;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
        error = vcpu_lock_one(dsc->sc, lastcpu);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, lastcpu);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, 0);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

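/*
 * Illustrative note: a devmem segment named, say, "highmem" for VM "myvm"
 * (both names placeholders) shows up as /dev/vmm.io/myvm.highmem and is
 * mapped with mmap(2) through devmem_mmap_single() above, with offsets
 * relative to the start of the segment rather than to guest physical
 * address 0.
 */
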
static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}