/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

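/*
 * Each virtual machine is exposed to userland as a character device: the
 * vm instance itself as /dev/vmm/<vmname> (created by sysctl_vmm_create()
 * below) and each device memory segment as /dev/vmm.io/<vmname>.<segname>
 * (created by devmem_create_cdev()).
 */
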
static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

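/*
 * Illustrative note: pr_allow_flag is registered in vmmdev_init() under the
 * name "vmm", so it is presumably exposed as the jail(8) parameter
 * "allow.vmm"; a jail would then need something like the following (a
 * sketch, not taken from this file) for vmm_priv_check() to succeed:
 *
 *	jail -c name=bhyvejail allow.vmm=1 ...
 */
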
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
		return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error, vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			break;
	}

	if (error) {
		while (--vcpu >= 0)
			vcpu_unlock_one(sc, vcpu);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	int vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++)
		vcpu_unlock_one(sc, vcpu);
}

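/*
 * Locking model used by the callers below: freezing a single vcpu acts as
 * a read lock on the guest memory map, because the map can only change
 * while every vcpu is frozen; vcpu_lock_all() therefore acts as the write
 * lock.
 */
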
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;
	uint16_t lastcpu;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
		    prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}

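/*
 * Illustrative usage (a sketch, not part of this file): the read/write
 * entry points above let the host inspect guest physical memory with
 * ordinary tools, e.g. dumping the first page of a vm named "testvm":
 *
 *	dd if=/dev/vmm/testvm bs=4096 count=1 | hexdump -C
 */
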
CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
	} else {
		bzero(mseg->name, sizeof(mseg->name));
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

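/*
 * Illustrative userland sketch of the register-set interface (assumptions:
 * a descriptor 'vmfd' opened on /dev/vmm/<name> and the vm_register_set
 * layout implied by the VM_GET_REGISTER_SET handler below):
 *
 *	int regnums[2] = { VM_REG_GUEST_RIP, VM_REG_GUEST_RSP };
 *	uint64_t regvals[2];
 *	struct vm_register_set rs = {
 *		.cpuid = 0, .count = 2, .regnums = regnums, .regvals = regvals
 *	};
 *	if (ioctl(vmfd, VM_GET_REGISTER_SET, &rs) == 0)
 *		printf("rip %#lx rsp %#lx\n", regvals[0], regvals[1]);
 */
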
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_activate_cpu *vac;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_cpu_topology *topology;
	uint64_t *regvals;
	int *regnums;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		state_changed = 2;
		break;

	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock a vcpu to make sure that the memory map cannot be
		 * modified while it is being inspected.
		 */
		vcpu = vm_get_maxcpus(sc->vm) - 1;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	default:
		break;
	}

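	/*
	 * Any ioctl not listed above runs without freezing any vcpu
	 * (state_changed remains 0).
	 */
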
	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
		    gg->gla, gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_activate_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_suspend_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_RESUME_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_resume_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
		    &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1)
		vcpu_unlock_one(sc, vcpu);
	else if (state_changed == 2)
		vcpu_unlock_all(sc);

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

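/*
 * Illustrative userland sketch of the main ioctl loop (assumptions: the
 * device node created by sysctl_vmm_create() below and the vm_run layout
 * from <machine/vmm_dev.h>; error handling omitted):
 *
 *	int vmfd = open("/dev/vmm/testvm", O_RDWR);
 *	struct vm_run vmrun = { .cpuid = 0 };
 *	while (ioctl(vmfd, VM_RUN, &vmrun) == 0) {
 *		// inspect vmrun.vm_exit and emulate or stop as appropriate
 *	}
 */
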
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	uint16_t lastcpu;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}

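/*
 * Illustrative note: vmmdev_mmap_single() lets userland mmap(2) guest
 * system memory through the vm device, with the file offset interpreted as
 * a guest physical address; PROT_EXEC is rejected above. A sketch:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, gpa);
 */
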
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 *
	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule all cdevs to be destroyed:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);

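/*
 * Illustrative usage (a sketch): writing a vm name to this sysctl tears
 * the vm and its device nodes down, e.g.
 *
 *	sysctl hw.vmm.destroy=testvm
 */
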
static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
	    NULL, 0, sysctl_vmm_create, "A", NULL);

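/*
 * Illustrative usage (a sketch): writing a name to this sysctl creates a
 * new vm and its /dev/vmm/<name> node, e.g.
 *
 *	sysctl hw.vmm.create=testvm
 */
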
void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	uint16_t lastcpu;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
	error = vcpu_lock_one(dsc->sc, lastcpu);
	if (error)
		return (error);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	vcpu_unlock_one(dsc->sc, lastcpu);

	if (seglen >= last) {
		vm_object_reference(*objp);
		return (0);
	} else {
		return (EINVAL);
	}
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

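/*
 * Illustrative note: the node created above appears as
 * /dev/vmm.io/<vmname>.<devname> and is usable only via mmap(2), since
 * devmemsw provides d_mmap_single but no read/write entry points.
 */
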
static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}