/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

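/*
 * Freezing a vcpu (VCPU_FROZEN) gives the caller exclusive access to that
 * vcpu's state.  Freezing any single vcpu also serves as a read lock on
 * VM-wide state such as the guest memory map, since that state is only
 * modified while all vcpus are frozen.
 */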
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

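/*
 * Freeze every vcpu in the VM.  On failure any vcpus that were already
 * frozen are thawed again, so the caller never observes a partially
 * locked VM.
 */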
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;
        uint16_t maxcpus;

        error = 0;
        maxcpus = vm_get_maxcpus(sc->vm);
        for (vcpu = 0; vcpu < maxcpus; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;
        uint16_t maxcpus;

        maxcpus = vm_get_maxcpus(sc->vm);
        for (vcpu = 0; vcpu < maxcpus; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

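/*
 * read(2)/write(2) support for /dev/vmm/<vmname>: the file offset is
 * interpreted as a guest physical address and data is copied to or from
 * guest memory one page at a time.
 *
 * Illustrative userland usage (hypothetical VM named "testvm"); dumps the
 * 4KB guest physical page at 1MB:
 *
 *      # dd if=/dev/vmm/testvm bs=4k skip=256 count=1 | hexdump -C
 */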
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa, maxaddr;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;
        uint16_t lastcpu;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        lastcpu = vm_get_maxcpus(sc->vm) - 1;
        error = vcpu_lock_one(sc, lastcpu);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        maxaddr = vmm_sysmem_maxaddr(sc->vm);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
                    prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ && gpa < maxaddr)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, lastcpu);
        return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

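/*
 * Copy the attributes of memory segment 'segid' out to userland.  For a
 * device memory segment the name of its cdev is returned as well.
 */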
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

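/*
 * Allocate a new memory segment.  A named (non-system) segment also gets
 * a devmem cdev created for it so that userland can mmap(2) the segment.
 */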
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, SPECNAMELEN + 1, NULL);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

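/*
 * ioctl(2) dispatcher for /dev/vmm/<vmname>.  Depending on the command,
 * either the targeted vcpu or every vcpu in the VM is frozen before the
 * operation and thawed afterwards.
 *
 * A minimal userland sketch (hypothetical VM named "testvm"; the '.cpuid'
 * field name is assumed from the "first field is the vcpu" convention
 * noted below; error handling omitted):
 *
 *      int fd = open("/dev/vmm/testvm", O_RDWR);
 *      struct vm_run vmrun = { .cpuid = 0 };
 *      ioctl(fd, VM_RUN, &vmrun);              (run vcpu 0 until it exits)
 */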
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        struct vm_cpu_topology *topology;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        error = 0;
        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = vm_get_maxcpus(sc->vm) - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                                        statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                                      &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                                      pptmsi->bus, pptmsi->slot, pptmsi->func,
                                      pptmsi->addr, pptmsi->msg,
                                      pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                                       pptmsix->bus, pptmsix->slot,
                                       pptmsix->func, pptmsix->idx,
                                       pptmsix->addr, pptmsix->msg,
                                       pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                                     pptmmio->func, pptmmio->gpa, pptmmio->len,
                                     pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                         pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                           pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                                            x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                                            x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                                 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        case VM_SET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                error = vm_set_topology(sc->vm, topology->sockets,
                    topology->cores, topology->threads, topology->maxcpus);
                break;
        case VM_GET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
                    &topology->threads, &topology->maxcpus);
                error = 0;
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}

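/*
 * d_mmap_single handler for /dev/vmm/<vmname>: translates the mmap(2)
 * offset, which is a guest physical address, into the backing VM object
 * so that guest system memory can be mapped into the calling process.
 *
 * Illustrative userland usage (hypothetical; maps the first 1MB of guest
 * memory, assuming 'fd' is an open descriptor for the vmm cdev):
 *
 *      void *gpa0 = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *          MAP_SHARED, fd, 0);
 */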
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        uint16_t lastcpu;
        bool sysmem;

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        lastcpu = vm_get_maxcpus(sc->vm) - 1;
        error = vcpu_lock_one(sc, lastcpu);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, lastcpu);
        return (error);
}

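/*
 * Final teardown of a VM: freeze all vcpus, release the devmem
 * bookkeeping, destroy the cdev, destroy the vm instance and unlink the
 * softc from the global list before freeing it.
 */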
static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

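/*
 * Handler for the hw.vmm.destroy sysctl: schedules the named VM's cdevs
 * for destruction.  Illustrative usage (hypothetical VM name):
 *
 *      # sysctl hw.vmm.destroy=testvm
 */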
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

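/*
 * Handler for the hw.vmm.create sysctl: creates the vm instance and its
 * /dev/vmm/<vmname> cdev.  Illustrative usage (hypothetical VM name):
 *
 *      # sysctl hw.vmm.create=testvm
 */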
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Look up the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

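/*
 * d_mmap_single handler for a devmem cdev (/dev/vmm.io/<vmname>.<segname>):
 * maps a device memory segment, which must fully cover the requested
 * range, into the calling process.
 */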
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        uint16_t lastcpu;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
        error = vcpu_lock_one(dsc->sc, lastcpu);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, lastcpu);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

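/*
 * Create the /dev/vmm.io/<vmname>.<devname> cdev for a device memory
 * segment.  On success ownership of 'devname' passes to the devmem softc;
 * it is freed when the cdev is eventually destroyed.
 */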
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, NULL);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

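/*
 * Callback invoked from destroy_dev_sched_cb() once the devmem cdev's
 * thread count drains to zero.  It only clears the back pointers; the
 * devmem softc itself is freed later by vmmdev_destroy().
 */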
static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev != NULL, ("%s: devmem cdev already destroyed",
            __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}