/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;
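
/*
 * Note: the global list of VM device softcs ('head') and the VSC_LINKED
 * linkage state of each softc are protected by 'vmmdev_mtx'.
 */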

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}
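
/*
 * Example: since d_read/d_write are wired to vmmdev_rw(), guest physical
 * memory can be inspected from the host with ordinary I/O on the VM device
 * node, where the file offset is a guest physical address.  A sketch for a
 * hypothetical VM named 'testvm':
 *
 *      dd if=/dev/vmm/testvm bs=4k count=1 | hexdump -C
 */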

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(VM_MEMSEG_NAME(mseg), name, SPECNAMELEN + 1, 0);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        error = 0;
        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = VM_MAXCPU - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                                        statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                                      &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                                      pptmsi->bus, pptmsi->slot, pptmsi->func,
                                      pptmsi->addr, pptmsi->msg,
                                      pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                                       pptmsix->bus, pptmsix->slot,
                                       pptmsix->func, pptmsix->idx,
                                       pptmsix->addr, pptmsix->msg,
                                       pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                                     pptmmio->func, pptmmio->gpa, pptmmio->len,
                                     pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                         pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                           pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                                            x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                                            x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                                 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}
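
/*
 * Sketch of how userland (e.g. libvmmapi) drives this ioctl interface.
 * The VM name 'testvm' and the lack of error handling are illustrative
 * only; the command and structure come from <machine/vmm_dev.h>:
 *
 *      int fd = open("/dev/vmm/testvm", O_RDWR);
 *      struct vm_stats stats;
 *      stats.cpuid = 0;
 *      if (fd >= 0 && ioctl(fd, VM_STATS, &stats) == 0)
 *              printf("%d stat entries\n", stats.num_entries);
 */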

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        bool sysmem;

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}
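
/*
 * Example: system memory segments can be mapped into a host process with
 * mmap(2) on the VM device node.  The file offset is interpreted as a
 * guest physical address, the entire range must fall within a single
 * sysmem mapping, and PROT_EXEC is rejected above.  A sketch, assuming
 * 'fd' is an open VM device descriptor and 'len' fits one mapping:
 *
 *      void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */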

static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);
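
/*
 * Example: a VM is destroyed from userland by writing its name to the
 * 'hw.vmm.destroy' sysctl defined above, e.g. for a hypothetical VM
 * 'testvm':
 *
 *      sysctl hw.vmm.destroy=testvm
 */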

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Look up the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_create, "A", NULL);
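
/*
 * Example: a VM is created by writing a name to the 'hw.vmm.create'
 * sysctl defined above; on success the device node /dev/vmm/<name>
 * appears.  For a hypothetical VM 'testvm':
 *
 *      sysctl hw.vmm.create=testvm
 *      ls /dev/vmm/testvm
 */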

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, NULL);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}
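
/*
 * Example: given the "vmm.io/%s.%s" format above, a devmem segment with a
 * hypothetical name 'bootrom' belonging to a hypothetical VM 'testvm' is
 * exposed to the host as /dev/vmm.io/testvm.bootrom and can be mapped
 * with mmap(2) via devmem_mmap_single().
 */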

static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}