/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}
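
/*
 * Illustrative userland sketch (not part of this driver): the read/write
 * handler above is what lets a tool such as dd(1) inspect guest physical
 * memory through /dev/vmm/<name>.  The VM name "testvm" below is a
 * hypothetical placeholder.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	char buf[4096];
 *	int fd = open("/dev/vmm/testvm", O_RDONLY);
 *	// lseek(2) is not supported; guest-physical memory is walked with
 *	// sequential read(2) calls, and holes read back as zeroes.
 *	ssize_t n = read(fd, buf, sizeof(buf));	// first 4KB of guest memory
 */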

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, SPECNAMELEN + 1, NULL);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_register_set *vmregset;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        uint64_t *regvals;
        int *regnums;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        error = 0;
        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_GLA2GPA_NOFAULT:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = VM_MAXCPU - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                    statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                    &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                    pptmsi->bus, pptmsi->slot, pptmsi->func,
                    pptmsi->addr, pptmsi->msg, pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                    pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx,
                    pptmsix->addr, pptmsix->msg, pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                    pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                    pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                    pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                    &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                    vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                    vmsegdesc->regnum, &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                    vmsegdesc->regnum, &vmsegdesc->desc);
                break;
        case VM_GET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = vm_get_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                if (error == 0)
                        error = copyout(regvals, vmregset->regvals,
                            sizeof(regvals[0]) * vmregset->count);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_SET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = copyin(vmregset->regvals, regvals,
                            sizeof(regvals[0]) * vmregset->count);
                if (error == 0)
                        error = vm_set_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                    vmcap->captype, &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                    vmcap->captype, vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                    x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                    x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_GLA2GPA_NOFAULT:
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
                    gg->gla, gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa_nofault unknown error %d", __func__,
                    error));
                break;
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}
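
/*
 * Illustrative userland sketch (not part of this driver): the ioctls
 * handled above are normally reached through libvmmapi, but they can be
 * issued directly on the VM's cdev.  A minimal, hypothetical example that
 * reads %rip of vcpu 0 from a VM named "testvm":
 *
 *	#include <sys/ioctl.h>
 *	#include <machine/vmm.h>
 *	#include <machine/vmm_dev.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	struct vm_register vmreg;
 *	int fd = open("/dev/vmm/testvm", O_RDWR);
 *	vmreg.cpuid = 0;	// the vcpu must be the first field (see above)
 *	vmreg.regnum = VM_REG_GUEST_RIP;
 *	if (ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
 *		printf("rip = 0x%016lx\n", vmreg.regval);
 */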

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        bool sysmem;

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}
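
/*
 * Illustrative userland sketch (not part of this driver): the
 * d_mmap_single handler above backs mmap(2) on /dev/vmm/<name>.  The file
 * offset is interpreted as a guest physical address, the requested range
 * must fit entirely within a single sysmem mapping, and PROT_EXEC is
 * rejected.  The VM name and the assumption that guest RAM is mapped at
 * guest-physical address 0 are hypothetical.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/vmm/testvm", O_RDWR);
 *	// Map the first 1MB of guest RAM into this process.
 *	void *gpa0 = mmap(NULL, 0x100000, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 */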

static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Lookup the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_create, "A", NULL);
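
/*
 * Illustrative sketch (not part of this driver): virtual machines are
 * created and destroyed by writing a name to the hw.vmm.create and
 * hw.vmm.destroy sysctl nodes registered above, e.g. with sysctlbyname(3)
 * from userland.  The VM name "testvm" is hypothetical.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <string.h>
 *
 *	const char *name = "testvm";
 *	sysctlbyname("hw.vmm.create", NULL, NULL, name, strlen(name));
 *	// ... use /dev/vmm/testvm ...
 *	sysctlbyname("hw.vmm.destroy", NULL, NULL, name, strlen(name));
 */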

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}
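
/*
 * Illustrative userland sketch (not part of this driver): a named
 * (non-sysmem) segment created via VM_ALLOC_MEMSEG appears as a devmem
 * cdev (see devmem_create_cdev() below) and can be mapped from offset 0
 * up to the segment length.  The VM name "testvm" and segment name
 * "bootrom" are hypothetical.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	size_t maplen = 0x1000;	// must not extend past the segment length
 *	int dfd = open("/dev/vmm.io/testvm.bootrom", O_RDWR);
 *	void *p = mmap(NULL, maplen, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, dfd, 0);
 */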

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, 0);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}