/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
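/*
 * VSC_LINKED is set once the softc has been inserted into the global
 * 'head' list, so that vmmdev_destroy() knows whether the softc must
 * be unlinked before it is freed.
 */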
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

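/*
 * A vcpu cannot run while it is in the VCPU_FROZEN state.  Freezing a
 * single vcpu is used as a read lock on the guest memory map (the map
 * can only change while every vcpu is frozen), and freezing all vcpus
 * is the corresponding write lock.  The helpers below implement this
 * protocol and are used throughout the ioctl handler.
 */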
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

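/*
 * read(2)/write(2) handler for /dev/vmm/<vmname>.  The file offset is
 * interpreted as a guest physical address, so guest memory can be
 * inspected from the host with ordinary tools, e.g. (a sketch assuming
 * a VM named "testvm"):
 *
 *      dd if=/dev/vmm/testvm bs=4096 skip=256 count=1
 *
 * reads the guest physical page at 1MB.
 */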
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

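/*
 * Helpers for the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls:
 * fetch or store a caller-supplied array of registers one at a time,
 * stopping at the first failure.
 */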
static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
                if (error)
                        break;
        }
        return (error);
}

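/*
 * ioctl(2) handler for /dev/vmm/<vmname>; this is the main control
 * interface used by bhyve(8) and libvmmapi.  Depending on the command,
 * zero, one or all vcpus are frozen around the operation (see the
 * first switch below).  A minimal userland sketch, assuming 'vmfd' is
 * an open descriptor for /dev/vmm/<vmname> and error handling is
 * omitted:
 *
 *      struct vm_register vmreg;
 *
 *      vmreg.cpuid = 0;
 *      vmreg.regnum = VM_REG_GUEST_RIP;
 *      ioctl(vmfd, VM_GET_REGISTER, &vmreg);
 *      (on success, vmreg.regval holds vcpu 0's %rip)
 */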
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_register_set *vmregset;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        uint64_t *regvals;
        int *regnums;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        error = 0;
        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = VM_MAXCPU - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                                        statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                                      &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                                      pptmsi->bus, pptmsi->slot, pptmsi->func,
                                      pptmsi->addr, pptmsi->msg,
                                      pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                                       pptmsix->bus, pptmsix->slot,
                                       pptmsix->func, pptmsix->idx,
                                       pptmsix->addr, pptmsix->msg,
                                       pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                                     pptmmio->func, pptmmio->gpa, pptmmio->len,
                                     pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                         pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                           pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = vm_get_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                if (error == 0)
                        error = copyout(regvals, vmregset->regvals,
                            sizeof(regvals[0]) * vmregset->count);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_SET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = copyin(vmregset->regvals, regvals,
                            sizeof(regvals[0]) * vmregset->count);
                if (error == 0)
                        error = vm_set_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                                            x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                                            x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                                 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}

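/*
 * mmap(2) handler for /dev/vmm/<vmname>.  The mapping offset is a guest
 * physical address and the request must lie entirely within a single
 * system-memory mapping; PROT_EXEC mappings are refused.  A minimal
 * userland sketch, assuming 'vmfd' is an open descriptor and 'gpa'
 * falls inside guest system memory:
 *
 *      void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *          vmfd, gpa);
 *
 * gives the host process a window onto guest memory starting at 'gpa'.
 */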
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        bool sysmem;

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

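/*
 * Sysctl handler that tears down a virtual machine by name, e.g.:
 *
 *      sysctl hw.vmm.destroy=testvm
 *
 * Destruction is deferred: the cdevs are only scheduled for destruction
 * here, and vmmdev_destroy() runs later from a taskqueue once the last
 * in-flight operation on the cdev has drained.
 */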
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

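/*
 * Sysctl handler that creates a virtual machine and its control device,
 * e.g.:
 *
 *      sysctl hw.vmm.create=testvm
 *
 * creates /dev/vmm/testvm.  libvmmapi's vm_create() uses this interface.
 */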
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Lookup the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

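/*
 * Device memory segments (e.g. a framebuffer) are exposed through
 * separate cdevs named /dev/vmm.io/<vmname>.<segname> instead of
 * through /dev/vmm/<vmname> itself.  devmem_mmap_single() is their
 * mmap(2) handler; unlike vmmdev_mmap_single() the offset here is
 * relative to the start of the segment, not a guest physical address.
 */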
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, 0);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}