/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

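/*
 * Check whether the calling thread is allowed to use vmm(4).  Unjailed
 * callers are always allowed; jailed callers require the "allow.vmm"
 * permission registered in vmmdev_init() below.
 */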
static int
vmm_priv_check(struct ucred *ucred)
{

        if (jailed(ucred) &&
            !(ucred->cr_prison->pr_allow & pr_allow_flag))
                return (EPERM);

        return (0);
}

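/*
 * Freeze a single vcpu so that an ioctl handler can operate on it without
 * racing against the vcpu running.  Returns EINVAL for an out-of-range
 * vcpu id, or a non-zero error from vcpu_set_state() if the vcpu cannot
 * be transitioned to VCPU_FROZEN.
 */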
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

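/*
 * Undo vcpu_lock_one().  The vcpu must still be in the VCPU_FROZEN state;
 * anything else indicates a lock/unlock imbalance and is fatal.
 */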
static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

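/*
 * Freeze every vcpu in the VM for ioctls that change machine-wide state.
 * On failure, vcpus that were already frozen are thawed again so the VM
 * is left unchanged.
 */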
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

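/*
 * Look up a VM softc by name.  The caller is expected to hold vmmdev_mtx;
 * the assertion is compiled out (see the XXX inside).
 */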
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

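/*
 * read(2)/write(2) handler for /dev/vmm/<name>.  The file offset is
 * interpreted as a guest physical address, so guest memory can be
 * inspected or patched from the host, e.g. (illustrative, for a VM
 * named "testvm"):
 *
 *      dd if=/dev/vmm/testvm bs=4k count=1 > page0
 *
 * One vcpu is frozen for the duration of the transfer to keep the guest
 * memory map from changing underneath the copy.
 */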
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa, maxaddr;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        maxaddr = vmm_sysmem_maxaddr(sc->vm);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ && gpa < maxaddr)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

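/*
 * Handler for VM_GET_MEMSEG: copy out the length of a memory segment and,
 * for a device memory segment, the name of its backing devmem cdev.
 */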
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

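/*
 * Handler for VM_ALLOC_MEMSEG.  A named segment is device memory and is
 * exposed through its own /dev/vmm.io/<vm>.<name> cdev; an anonymous
 * segment is guest system memory.  On success the name buffer is handed
 * off to the devmem cdev and freed when that cdev is destroyed.
 */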
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

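/*
 * Helpers for the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls:
 * apply vm_get_register()/vm_set_register() to an array of registers,
 * stopping at the first error.
 */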
static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
                if (error)
                        break;
        }
        return (error);
}

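/*
 * ioctl handler for /dev/vmm/<name>.  Commands that operate on a single
 * vcpu freeze just that vcpu; commands that touch machine-wide state
 * (memory map, passthru devices, reinit) freeze all vcpus first.  The
 * locking decision is made in the first switch below, the actual work
 * in the second.
 */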
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_register_set *vmregset;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        struct vm_cpu_topology *topology;
        uint64_t *regvals;
        int *regnums;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_GLA2GPA_NOFAULT:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = VM_MAXCPU - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                                        statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                                      &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                                      pptmsi->bus, pptmsi->slot, pptmsi->func,
                                      pptmsi->addr, pptmsi->msg,
                                      pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                                       pptmsix->bus, pptmsix->slot,
                                       pptmsix->func, pptmsix->idx,
                                       pptmsix->addr, pptmsix->msg,
                                       pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                                     pptmmio->func, pptmmio->gpa, pptmmio->len,
                                     pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                         pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                                           pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                                        vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                                        vmsegdesc->regnum,
                                        &vmsegdesc->desc);
                break;
        case VM_GET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = vm_get_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                if (error == 0)
                        error = copyout(regvals, vmregset->regvals,
                            sizeof(regvals[0]) * vmregset->count);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_SET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = copyin(vmregset->regvals, regvals,
                            sizeof(regvals[0]) * vmregset->count);
                if (error == 0)
                        error = vm_set_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                                          vmcap->captype,
                                          vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                                            x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                                            x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                                 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_GLA2GPA_NOFAULT:
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
                    gg->gla, gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else if (vm_cpuset->which == VM_DEBUG_CPUS)
                        *cpuset = vm_debug_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SUSPEND_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_suspend_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_RESUME_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_resume_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        case VM_SET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                error = vm_set_topology(sc->vm, topology->sockets,
                    topology->cores, topology->threads, topology->maxcpus);
                break;
        case VM_GET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
                    &topology->threads, &topology->maxcpus);
                error = 0;
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}

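/*
 * mmap(2) handler for /dev/vmm/<name>.  The requested range must fall
 * entirely within a single mapping of a system memory segment; device
 * memory segments are mapped through their own devmem cdev instead (see
 * devmem_mmap_single() below).
 */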
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        bool sysmem;

        error = vmm_priv_check(curthread->td_ucred);
        if (error)
                return (error);

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

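/*
 * Final teardown of a VM.  Called directly when creation fails part way,
 * or as the destroy_dev_sched_cb() callback once the last thread has left
 * the cdev.  All vcpus are frozen first so that nothing can still be
 * executing guest code when the vm instance is destroyed.
 */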
static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

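/*
 * Handler for the hw.vmm.destroy sysctl.  Detaches the cdev from the
 * softc (which also marks the VM as scheduled for destruction) and queues
 * the devmem cdevs and the VM cdev for asynchronous destruction; the real
 * teardown happens later in devmem_destroy() and vmmdev_destroy().
 */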
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        error = vmm_priv_check(req->td->td_ucred);
        if (error)
                return (error);

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
            CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

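/*
 * Handler for the hw.vmm.create sysctl: writing a VM name creates the vm
 * instance and its /dev/vmm/<name> cdev.  From userspace this looks like
 * (illustrative):
 *
 *      sysctl hw.vmm.create=testvm
 *
 * which is what libvmmapi's vm_create() does under the hood.
 */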
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        error = vmm_priv_check(req->td->td_ucred);
        if (error)
                return (error);

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Lookup the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
            CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
            NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
        pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
            "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

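/*
 * mmap(2) handler for a devmem cdev.  Maps the vm_object backing this
 * device memory segment; the request must fit within the segment.  An
 * illustrative userspace sequence, assuming a segment named "bootrom"
 * on a VM named "testvm":
 *
 *      fd = open("/dev/vmm.io/testvm.bootrom", O_RDWR);
 *      ptr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */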
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

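/*
 * Create the /dev/vmm.io/<vmname>.<devname> cdev for a device memory
 * segment and link its softc into the VM's devmem list.  On success this
 * takes ownership of 'devname'; the string is freed in vmmdev_destroy()
 * along with the devmem softc.
 */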
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, 0);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}