FreeBSD/FreeBSD.git: sys/amd64/vmm/vmm_dev.c
MFC: r332298,333712,334199,334216,334219 bhyve cpu topology
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

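/*
 * vcpu locking: ioctls that touch per-vcpu state freeze the target vcpu
 * (VCPU_FROZEN) for the duration of the operation; vcpu_lock_all() freezes
 * every vcpu for whole-VM operations.  Freezing any single vcpu also acts
 * as a read lock on the guest memory map, since the map can only change
 * while all vcpus are frozen.
 */
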
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error, vcpu;

	for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			break;
	}

	if (error) {
		while (--vcpu >= 0)
			vcpu_unlock_one(sc, vcpu);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	int vcpu;

	for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
		vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	error = vcpu_lock_one(sc, VM_MAXCPU - 1);
	if (error)
		return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vcpu_unlock_one(sc, VM_MAXCPU - 1);
	return (error);
}

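/*
 * Example (userland sketch): the read/write entry points above let tools
 * like dd(1) inspect guest physical memory through the VM's device node.
 * Dumping the first page of an (assumed) VM named "testvm":
 *
 *	dd if=/dev/vmm/testvm of=page0.bin bs=4096 count=1
 */
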
CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
	} else {
		bzero(mseg->name, sizeof(mseg->name));
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

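/*
 * A memory segment with a name is a "devmem" segment (e.g. framebuffer or
 * bootrom memory): alloc_memseg() above creates a dedicated device node
 * for it via devmem_create_cdev(), which appears as
 * /dev/vmm.io/<vmname>.<segname>.  Anonymous segments are guest system
 * memory and are accessed through the VM's own /dev/vmm/<vmname> node.
 */
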
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error, vcpu, state_changed, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_activate_cpu *vac;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_cpu_topology *topology;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	error = 0;
	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		state_changed = 2;
		break;

	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock a vcpu to make sure that the memory map cannot be
		 * modified while it is being inspected.
		 */
		vcpu = VM_MAXCPU - 1;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	default:
		break;
	}

	switch (cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
		    statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
		    &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
		    pptmsi->bus, pptmsi->slot, pptmsi->func,
		    pptmsi->addr, pptmsi->msg, pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
		    pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx,
		    pptmsix->addr, pptmsix->msg, pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum, &vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum, &vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype, &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype, vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
		    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
		    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
		    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_ACTIVATE_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_activate_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
		    &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1)
		vcpu_unlock_one(sc, vcpu);
	else if (state_changed == 2)
		vcpu_unlock_all(sc);

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

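/*
 * Example (userland sketch): a control program drives a VM by opening its
 * device node and issuing the ioctls dispatched above.  Reading a vcpu
 * register of an (assumed) existing VM named "testvm":
 *
 *	#include <machine/vmm.h>
 *	#include <machine/vmm_dev.h>
 *
 *	int fd = open("/dev/vmm/testvm", O_RDWR);
 *	struct vm_register vmreg = {
 *		.cpuid = 0,
 *		.regnum = VM_REG_GUEST_RIP,
 *	};
 *	if (fd >= 0 && ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
 *		printf("vcpu0 rip = 0x%lx\n", vmreg.regval);
 */
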
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	error = vcpu_lock_one(sc, VM_MAXCPU - 1);
	if (error)
		return (error);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vcpu_unlock_one(sc, VM_MAXCPU - 1);
	return (error);
}

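/*
 * Example (userland sketch): the handler above lets a process map guest
 * system memory directly; the mmap offset is a guest physical address.
 * Mapping the first 1 MB, assuming a sysmem mapping exists at gpa 0:
 *
 *	void *gmem = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 */
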
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 *
	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule all cdevs to be destroyed:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
    NULL, 0, sysctl_vmm_destroy, "A", NULL);
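
/*
 * Example: destroy a VM from userland by writing its name to the sysctl
 * handled above:
 *
 *	sysctl hw.vmm.destroy=testvm
 */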

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
    NULL, 0, sysctl_vmm_create, "A", NULL);
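
/*
 * Example: create a VM from userland by writing its name to the sysctl
 * handled above; the handler then creates the /dev/vmm/<name> node:
 *
 *	sysctl hw.vmm.create=testvm
 */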

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
	if (error)
		return (error);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

	if (seglen >= last) {
		vm_object_reference(*objp);
		return (0);
	} else {
		return (EINVAL);
	}
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};
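
/*
 * Example (userland sketch): a devmem segment named "fbuf" in a VM named
 * "testvm" (both names assumed) is mapped through its own node rather
 * than through /dev/vmm/testvm:
 *
 *	int dfd = open("/dev/vmm.io/testvm.fbuf", O_RDWR);
 *	void *fb = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    dfd, 0);
 */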

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}