/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

static int
vmm_priv_check(struct ucred *ucred)
{

        if (jailed(ucred) &&
            !(ucred->cr_prison->pr_allow & pr_allow_flag))
                return (EPERM);

        return (0);
}

static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;
        uint16_t maxcpus;

        error = 0;
        maxcpus = vm_get_maxcpus(sc->vm);
        for (vcpu = 0; vcpu < maxcpus; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;
        uint16_t maxcpus;

        maxcpus = vm_get_maxcpus(sc->vm);
        for (vcpu = 0; vcpu < maxcpus; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa, maxaddr;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;
        uint16_t lastcpu;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        lastcpu = vm_get_maxcpus(sc->vm) - 1;
        error = vcpu_lock_one(sc, lastcpu);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        maxaddr = vmm_sysmem_maxaddr(sc->vm);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
                    prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ && gpa < maxaddr)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, lastcpu);
        return (error);
}
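
/*
 * Example (sketch): the read/write interface above lets host tools inspect
 * guest-physical memory through the VM's /dev node.  Assuming a VM named
 * "myvm" (hypothetical name), the first page of guest memory could be
 * dumped with something like:
 *
 *      dd if=/dev/vmm/myvm bs=4k count=1 | hexdump -C
 *
 * As the comment in vmmdev_rw() notes, the device does not support
 * lseek(2), so dd(1) simulates a seek by read(2)ing and discarding blocks.
 */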

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, sizeof(mseg->name),
                    NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        /*
         * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
         * be stripped off when devfs processes the full string.
         */
        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, sizeof(mseg->name), NULL);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_register_set *vmregset;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        struct vm_cpu_topology *topology;
        uint64_t *regvals;
        int *regnums;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_GLA2GPA_NOFAULT:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = vm_get_maxcpus(sc->vm) - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                                        statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                                      &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                                      pptmsi->bus, pptmsi->slot, pptmsi->func,
                                      pptmsi->addr, pptmsi->msg,
                                      pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                                       pptmsix->bus, pptmsix->slot,
                                       pptmsix->func, pptmsix->idx,
                                       pptmsix->addr, pptmsix->msg,
                                       pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                                     pptmmio->func, pptmmio->gpa, pptmmio->len,
                                     pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                         pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                           pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = vm_get_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                if (error == 0)
                        error = copyout(regvals, vmregset->regvals,
                            sizeof(regvals[0]) * vmregset->count);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_SET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = copyin(vmregset->regvals, regvals,
                            sizeof(regvals[0]) * vmregset->count);
                if (error == 0)
                        error = vm_set_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                                            x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                                            x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                                 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_GLA2GPA_NOFAULT:
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
                    gg->gla, gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else if (vm_cpuset->which == VM_DEBUG_CPUS)
                        *cpuset = vm_debug_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SUSPEND_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_suspend_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_RESUME_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_resume_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        case VM_SET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                error = vm_set_topology(sc->vm, topology->sockets,
                    topology->cores, topology->threads, topology->maxcpus);
                break;
        case VM_GET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
                    &topology->threads, &topology->maxcpus);
                error = 0;
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}
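
/*
 * Example (sketch): userland reaches the handler above through the per-VM
 * device node created by sysctl_vmm_create() below.  Assuming a VM named
 * "myvm" (hypothetical name) and <machine/vmm_dev.h> for the ioctl
 * definitions, reading a vcpu register looks roughly like:
 *
 *      struct vm_register vmreg;
 *      int fd;
 *
 *      fd = open("/dev/vmm/myvm", O_RDWR);
 *      vmreg.cpuid = 0;
 *      vmreg.regnum = VM_REG_GUEST_RIP;
 *      if (fd >= 0 && ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
 *              printf("rip 0x%lx\n", vmreg.regval);
 *
 * In practice consumers go through libvmmapi (vm_get_register() and
 * friends) rather than issuing these ioctls directly.
 */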

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        uint16_t lastcpu;
        bool sysmem;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        lastcpu = vm_get_maxcpus(sc->vm) - 1;
        error = vcpu_lock_one(sc, lastcpu);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, lastcpu);
        return (error);
}
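
/*
 * Example (sketch): because the handler above translates a device offset
 * into a guest-physical address, a host tool can map guest RAM directly.
 * With a hypothetical VM "myvm" whose lowmem segment starts at
 * guest-physical address 0, something like the following maps its first
 * 4KB:
 *
 *      int fd = open("/dev/vmm/myvm", O_RDWR);
 *      void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *          fd, 0);
 *
 * Note that PROT_EXEC mappings are rejected above, and the requested range
 * must fall entirely within a single sysmem segment.
 */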

static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        char *buf;
        int error, buflen;

        error = vmm_priv_check(req->td->td_ucred);
        if (error)
                return (error);

        buflen = VM_MAX_NAMELEN + 1;
        buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
        strlcpy(buf, "beavis", buflen);
        error = sysctl_handle_string(oidp, buf, buflen, req);
        if (error != 0 || req->newptr == NULL)
                goto out;

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                error = EINVAL;
                goto out;
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        error = 0;

out:
        free(buf, M_VMMDEV);
        return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
            CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);
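
/*
 * Example (sketch): with a hypothetical VM named "myvm", the handler above
 * is reached via:
 *
 *      sysctl hw.vmm.destroy=myvm
 *
 * Destruction is asynchronous; the sysctl only schedules it.
 */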

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char *buf;
        int error, buflen;

        error = vmm_priv_check(req->td->td_ucred);
        if (error)
                return (error);

        buflen = VM_MAX_NAMELEN + 1;
        buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
        strlcpy(buf, "beavis", buflen);
        error = sysctl_handle_string(oidp, buf, buflen, req);
        if (error != 0 || req->newptr == NULL)
                goto out;

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL) {
                error = EEXIST;
                goto out;
        }

        error = vm_create(buf, &vm);
        if (error != 0)
                goto out;

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Lookup the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                error = EEXIST;
                goto out;
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                goto out;
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

out:
        free(buf, M_VMMDEV);
        return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
            CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
            NULL, 0, sysctl_vmm_create, "A", NULL);
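
/*
 * Example (sketch): creating a VM from userland, again with the
 * hypothetical name "myvm":
 *
 *      sysctl hw.vmm.create=myvm
 *
 * On success a /dev/vmm/myvm node appears, backed by vmmdevsw above.
 * bhyve(8) and bhyveload(8) normally do this via libvmmapi's vm_create()
 * and vm_open() instead of touching the sysctl directly.
 */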

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
        pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
            "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        uint16_t lastcpu;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
        error = vcpu_lock_one(dsc->sc, lastcpu);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, lastcpu);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, NULL);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}
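
/*
 * Example (sketch): a non-sysmem segment named e.g. "bootrom" (hypothetical
 * name) allocated via VM_ALLOC_MEMSEG for VM "myvm" shows up as
 * /dev/vmm.io/myvm.bootrom and is mapped through devmem_mmap_single()
 * above.
 */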

static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}