/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

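/*
 * Freeze a single vcpu so that its state cannot change while the caller
 * inspects or modifies it.  Undone by vcpu_unlock_one().
 */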
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

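/*
 * Thaw a vcpu previously frozen by vcpu_lock_one().  The vcpu must be in
 * the VCPU_FROZEN state; anything else indicates a locking bug.
 */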
static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

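/*
 * Freeze every vcpu in the virtual machine.  On failure, any vcpus that
 * were already frozen are thawed before returning.
 */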
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

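/*
 * Find the softc for the virtual machine with the given name.  Callers
 * are expected to hold vmmdev_mtx across the lookup.
 */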
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

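/*
 * read(2)/write(2) handler for /dev/vmm/<name>: copies data between the
 * caller's buffer and guest physical memory one page at a time, treating
 * the uio offset as a guest physical address.
 */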
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

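/*
 * Handler for VM_GET_MEMSEG: returns the length of the requested memory
 * segment and, for device memory, the name of its devmem cdev.
 */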
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

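/*
 * Handler for VM_ALLOC_MEMSEG: allocates a system or device memory
 * segment.  A named (device memory) segment also gets a devmem cdev.
 */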
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, SPECNAMELEN + 1, NULL);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

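/*
 * ioctl(2) handler for /dev/vmm/<name>: dispatches all VMM ioctls.
 * Depending on the command, one vcpu or all vcpus are frozen first so
 * that the operation does not race with a running guest.
 */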
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        error = 0;
        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = VM_MAXCPU - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                                        statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                                      &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                                      pptmsi->bus, pptmsi->slot, pptmsi->func,
                                      pptmsi->addr, pptmsi->msg,
                                      pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                                       pptmsix->bus, pptmsix->slot,
                                       pptmsix->func, pptmsix->idx,
                                       pptmsix->addr, pptmsix->msg,
                                       pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                                     pptmmio->func, pptmmio->gpa, pptmmio->len,
                                     pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                         pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                           pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                                            x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                                            x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                                 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}

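/*
 * mmap(2) handler for /dev/vmm/<name>: translates the requested range of
 * guest physical addresses into the backing VM object so guest system
 * memory can be mapped into the host process.
 */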
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        bool sysmem;

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

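/*
 * Final teardown of a virtual machine, invoked from a taskqueue context
 * once the last reference to the cdev has been dropped.
 */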
static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

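/*
 * Handler for the hw.vmm.destroy sysctl: schedules the named virtual
 * machine and its devmem cdevs for asynchronous destruction.
 */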
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

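/*
 * Handler for the hw.vmm.create sysctl: creates a new virtual machine
 * with the given name and its /dev/vmm/<name> device node.
 */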
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Lookup the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

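/*
 * mmap(2) handler for a devmem cdev: returns the VM object backing the
 * device memory segment, provided the requested range fits within it.
 */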
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

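/*
 * Create the /dev/vmm.io/<vm>.<name> cdev for a device memory segment
 * and link its softc into the owning virtual machine's devmem list.
 */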
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, 0);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

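/*
 * Callback invoked after a devmem cdev has been destroyed; the softc
 * itself is freed later by vmmdev_destroy().
 */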
static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}