/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
        int     segid;
        char    *name;
        struct cdev *cdev;
        struct vmmdev_softc *sc;
        SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
        struct vm       *vm;            /* vm instance cookie */
        struct cdev     *cdev;
        SLIST_ENTRY(vmmdev_softc) link;
        SLIST_HEAD(, devmem_softc) devmem;
        int             flags;
};
#define VSC_LINKED              0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

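/*
 * A vcpu is "locked" by moving it into the VCPU_FROZEN state so that its
 * software state can be examined or modified safely while no ioctl handler
 * or vcpu thread is running it.  vcpu_unlock_one() panics if the vcpu is
 * not frozen, since that would indicate a locking protocol violation.
 */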
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
        int error;

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
        return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
        enum vcpu_state state;

        state = vcpu_get_state(sc->vm, vcpu, NULL);
        if (state != VCPU_FROZEN) {
                panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
                    vcpu, state);
        }

        vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

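/*
 * Freeze all vcpus in the virtual machine.  On failure any vcpus that were
 * already frozen are unwound back to VCPU_IDLE before the error is returned.
 */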
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
        int error, vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        break;
        }

        if (error) {
                while (--vcpu >= 0)
                        vcpu_unlock_one(sc, vcpu);
        }

        return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
        int vcpu;

        for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
                vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
        struct vmmdev_softc *sc;

#ifdef notyet   /* XXX kernel is not compiled with invariants */
        mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

        SLIST_FOREACH(sc, &head, link) {
                if (strcmp(name, vm_name(sc->vm)) == 0)
                        break;
        }

        return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

        return (cdev->si_drv1);
}

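/*
 * read(2)/write(2) handler for /dev/vmm/<name>: copies data between the
 * caller's buffer and guest physical memory one page at a time, treating
 * 'uio_offset' as the guest physical address.
 */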
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
        int error, off, c, prot;
        vm_paddr_t gpa;
        void *hpa, *cookie;
        struct vmmdev_softc *sc;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
        while (uio->uio_resid > 0 && error == 0) {
                gpa = uio->uio_offset;
                off = gpa & PAGE_MASK;
                c = min(uio->uio_resid, PAGE_SIZE - off);

                /*
                 * The VM has a hole in its physical memory map. If we want to
                 * use 'dd' to inspect memory beyond the hole we need to
                 * provide bogus data for memory that lies in the hole.
                 *
                 * Since this device does not support lseek(2), dd(1) will
                 * read(2) blocks of data to simulate the lseek(2).
                 */
                hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
                if (hpa == NULL) {
                        if (uio->uio_rw == UIO_READ)
                                error = uiomove(__DECONST(void *, zero_region),
                                    c, uio);
                        else
                                error = EFAULT;
                } else {
                        error = uiomove(hpa, c, uio);
                        vm_gpa_release(cookie);
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

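/*
 * Memory segment helpers for the VM_GET_MEMSEG and VM_ALLOC_MEMSEG ioctls.
 * A named (non-sysmem) segment is device memory and is exposed to userspace
 * through its own cdev, /dev/vmm.io/<vmname>.<segname>.
 */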
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        struct devmem_softc *dsc;
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                SLIST_FOREACH(dsc, &sc->devmem, link) {
                        if (dsc->segid == mseg->segid)
                                break;
                }
                KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
                    __func__, mseg->segid));
                error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
        } else {
                bzero(mseg->name, sizeof(mseg->name));
        }

        return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
        char *name;
        int error;
        bool sysmem;

        error = 0;
        name = NULL;
        sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
                name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
                error = copystr(mseg->name, name, SPECNAMELEN + 1, NULL);
                if (error)
                        goto done;
        }

        error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
        if (error)
                goto done;

        if (VM_MEMSEG_NAME(mseg)) {
                error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
                if (error)
                        vm_free_memseg(sc->vm, mseg->segid);
                else
                        name = NULL;    /* freed when 'cdev' is destroyed */
        }
done:
        free(name, M_VMMDEV);
        return (error);
}

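/*
 * Helpers for the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls:
 * fetch or store a batch of registers, stopping at the first failure.
 */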
static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
                if (error)
                        break;
        }
        return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
        int error, i;

        error = 0;
        for (i = 0; i < count; i++) {
                error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
                if (error)
                        break;
        }
        return (error);
}

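/*
 * ioctl handler for /dev/vmm/<name>.  Commands that touch per-vcpu state
 * freeze that single vcpu (state_changed == 1); commands that modify
 * machine-wide state, such as the memory map, freeze all vcpus
 * (state_changed == 2).  Frozen vcpus are released before returning.
 */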
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
             struct thread *td)
{
        int error, vcpu, state_changed, size;
        cpuset_t *cpuset;
        struct vmmdev_softc *sc;
        struct vm_register *vmreg;
        struct vm_seg_desc *vmsegdesc;
        struct vm_register_set *vmregset;
        struct vm_run *vmrun;
        struct vm_exception *vmexc;
        struct vm_lapic_irq *vmirq;
        struct vm_lapic_msi *vmmsi;
        struct vm_ioapic_irq *ioapic_irq;
        struct vm_isa_irq *isa_irq;
        struct vm_isa_irq_trigger *isa_irq_trigger;
        struct vm_capability *vmcap;
        struct vm_pptdev *pptdev;
        struct vm_pptdev_mmio *pptmmio;
        struct vm_pptdev_msi *pptmsi;
        struct vm_pptdev_msix *pptmsix;
        struct vm_nmi *vmnmi;
        struct vm_stats *vmstats;
        struct vm_stat_desc *statdesc;
        struct vm_x2apic *x2apic;
        struct vm_gpa_pte *gpapte;
        struct vm_suspend *vmsuspend;
        struct vm_gla2gpa *gg;
        struct vm_activate_cpu *vac;
        struct vm_cpuset *vm_cpuset;
        struct vm_intinfo *vmii;
        struct vm_rtc_time *rtctime;
        struct vm_rtc_data *rtcdata;
        struct vm_memmap *mm;
        struct vm_cpu_topology *topology;
        uint64_t *regvals;
        int *regnums;

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL)
                return (ENXIO);

        error = 0;
        vcpu = -1;
        state_changed = 0;

        /*
         * Some VMM ioctls can operate only on vcpus that are not running.
         */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_GLA2GPA_NOFAULT:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
                /*
                 * XXX fragile, handle with care
                 * Assumes that the first field of the ioctl data is the vcpu.
                 */
                vcpu = *(int *)data;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        case VM_MAP_PPTDEV_MMIO:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_REINIT:
                /*
                 * ioctls that operate on the entire virtual machine must
                 * prevent all vcpus from running.
                 */
                error = vcpu_lock_all(sc);
                if (error)
                        goto done;
                state_changed = 2;
                break;

        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
                /*
                 * Lock a vcpu to make sure that the memory map cannot be
                 * modified while it is being inspected.
                 */
                vcpu = VM_MAXCPU - 1;
                error = vcpu_lock_one(sc, vcpu);
                if (error)
                        goto done;
                state_changed = 1;
                break;

        default:
                break;
        }

        switch (cmd) {
        case VM_RUN:
                vmrun = (struct vm_run *)data;
                error = vm_run(sc->vm, vmrun);
                break;
        case VM_SUSPEND:
                vmsuspend = (struct vm_suspend *)data;
                error = vm_suspend(sc->vm, vmsuspend->how);
                break;
        case VM_REINIT:
                error = vm_reinit(sc->vm);
                break;
        case VM_STAT_DESC: {
                statdesc = (struct vm_stat_desc *)data;
                error = vmm_stat_desc_copy(statdesc->index,
                    statdesc->desc, sizeof(statdesc->desc));
                break;
        }
        case VM_STATS: {
                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                vmstats = (struct vm_stats *)data;
                getmicrotime(&vmstats->tv);
                error = vmm_stat_copy(sc->vm, vmstats->cpuid,
                    &vmstats->num_entries, vmstats->statbuf);
                break;
        }
        case VM_PPTDEV_MSI:
                pptmsi = (struct vm_pptdev_msi *)data;
                error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
                    pptmsi->bus, pptmsi->slot, pptmsi->func,
                    pptmsi->addr, pptmsi->msg, pptmsi->numvec);
                break;
        case VM_PPTDEV_MSIX:
                pptmsix = (struct vm_pptdev_msix *)data;
                error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
                    pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx,
                    pptmsix->addr, pptmsix->msg, pptmsix->vector_control);
                break;
        case VM_MAP_PPTDEV_MMIO:
                pptmmio = (struct vm_pptdev_mmio *)data;
                error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
                    pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa);
                break;
        case VM_BIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                    pptdev->func);
                break;
        case VM_UNBIND_PPTDEV:
                pptdev = (struct vm_pptdev *)data;
                error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
                    pptdev->func);
                break;
        case VM_INJECT_EXCEPTION:
                vmexc = (struct vm_exception *)data;
                error = vm_inject_exception(sc->vm, vmexc->cpuid,
                    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
                    vmexc->restart_instruction);
                break;
        case VM_INJECT_NMI:
                vmnmi = (struct vm_nmi *)data;
                error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
                break;
        case VM_LAPIC_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
                break;
        case VM_LAPIC_LOCAL_IRQ:
                vmirq = (struct vm_lapic_irq *)data;
                error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
                    vmirq->vector);
                break;
        case VM_LAPIC_MSI:
                vmmsi = (struct vm_lapic_msi *)data;
                error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
                break;
        case VM_IOAPIC_ASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_DEASSERT_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PULSE_IRQ:
                ioapic_irq = (struct vm_ioapic_irq *)data;
                error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
                break;
        case VM_IOAPIC_PINCOUNT:
                *(int *)data = vioapic_pincount(sc->vm);
                break;
        case VM_ISA_ASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_assert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_DEASSERT_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_deassert_irq(sc->vm,
                            isa_irq->ioapic_irq);
                break;
        case VM_ISA_PULSE_IRQ:
                isa_irq = (struct vm_isa_irq *)data;
                error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
                if (error == 0 && isa_irq->ioapic_irq != -1)
                        error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
                break;
        case VM_ISA_SET_IRQ_TRIGGER:
                isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
                error = vatpic_set_irq_trigger(sc->vm,
                    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
                break;
        case VM_MMAP_GETNEXT:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
                    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
                break;
        case VM_MMAP_MEMSEG:
                mm = (struct vm_memmap *)data;
                error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
                    mm->len, mm->prot, mm->flags);
                break;
        case VM_ALLOC_MEMSEG:
                error = alloc_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data);
                break;
        case VM_GET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                    &vmreg->regval);
                break;
        case VM_SET_REGISTER:
                vmreg = (struct vm_register *)data;
                error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
                    vmreg->regval);
                break;
        case VM_SET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
                    vmsegdesc->regnum, &vmsegdesc->desc);
                break;
        case VM_GET_SEGMENT_DESCRIPTOR:
                vmsegdesc = (struct vm_seg_desc *)data;
                error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
                    vmsegdesc->regnum, &vmsegdesc->desc);
                break;
        case VM_GET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = vm_get_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                if (error == 0)
                        error = copyout(regvals, vmregset->regvals,
                            sizeof(regvals[0]) * vmregset->count);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_SET_REGISTER_SET:
                vmregset = (struct vm_register_set *)data;
                if (vmregset->count > VM_REG_LAST) {
                        error = EINVAL;
                        break;
                }
                regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
                    M_WAITOK);
                error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
                    vmregset->count);
                if (error == 0)
                        error = copyin(vmregset->regvals, regvals,
                            sizeof(regvals[0]) * vmregset->count);
                if (error == 0)
                        error = vm_set_register_set(sc->vm, vmregset->cpuid,
                            vmregset->count, regnums, regvals);
                free(regvals, M_VMMDEV);
                free(regnums, M_VMMDEV);
                break;
        case VM_GET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_get_capability(sc->vm, vmcap->cpuid,
                    vmcap->captype, &vmcap->capval);
                break;
        case VM_SET_CAPABILITY:
                vmcap = (struct vm_capability *)data;
                error = vm_set_capability(sc->vm, vmcap->cpuid,
                    vmcap->captype, vmcap->capval);
                break;
        case VM_SET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_set_x2apic_state(sc->vm,
                    x2apic->cpuid, x2apic->state);
                break;
        case VM_GET_X2APIC_STATE:
                x2apic = (struct vm_x2apic *)data;
                error = vm_get_x2apic_state(sc->vm,
                    x2apic->cpuid, &x2apic->state);
                break;
        case VM_GET_GPA_PMAP:
                gpapte = (struct vm_gpa_pte *)data;
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
                    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
                error = 0;
                break;
        case VM_GET_HPET_CAPABILITIES:
                error = vhpet_getcap((struct vm_hpet_cap *)data);
                break;
        case VM_GLA2GPA: {
                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
                    gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa unknown error %d", __func__, error));
                break;
        }
        case VM_GLA2GPA_NOFAULT:
                gg = (struct vm_gla2gpa *)data;
                error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
                    gg->gla, gg->prot, &gg->gpa, &gg->fault);
                KASSERT(error == 0 || error == EFAULT,
                    ("%s: vm_gla2gpa_nofault unknown error %d", __func__,
                    error));
                break;
        case VM_ACTIVATE_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_activate_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_GET_CPUS:
                error = 0;
                vm_cpuset = (struct vm_cpuset *)data;
                size = vm_cpuset->cpusetsize;
                if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
                        error = ERANGE;
                        break;
                }
                cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
                if (vm_cpuset->which == VM_ACTIVE_CPUS)
                        *cpuset = vm_active_cpus(sc->vm);
                else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
                        *cpuset = vm_suspended_cpus(sc->vm);
                else if (vm_cpuset->which == VM_DEBUG_CPUS)
                        *cpuset = vm_debug_cpus(sc->vm);
                else
                        error = EINVAL;
                if (error == 0)
                        error = copyout(cpuset, vm_cpuset->cpus, size);
                free(cpuset, M_TEMP);
                break;
        case VM_SUSPEND_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_suspend_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_RESUME_CPU:
                vac = (struct vm_activate_cpu *)data;
                error = vm_resume_cpu(sc->vm, vac->vcpuid);
                break;
        case VM_SET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
                break;
        case VM_GET_INTINFO:
                vmii = (struct vm_intinfo *)data;
                error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
                    &vmii->info2);
                break;
        case VM_RTC_WRITE:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_write(sc->vm, rtcdata->offset,
                    rtcdata->value);
                break;
        case VM_RTC_READ:
                rtcdata = (struct vm_rtc_data *)data;
                error = vrtc_nvram_read(sc->vm, rtcdata->offset,
                    &rtcdata->value);
                break;
        case VM_RTC_SETTIME:
                rtctime = (struct vm_rtc_time *)data;
                error = vrtc_set_time(sc->vm, rtctime->secs);
                break;
        case VM_RTC_GETTIME:
                error = 0;
                rtctime = (struct vm_rtc_time *)data;
                rtctime->secs = vrtc_get_time(sc->vm);
                break;
        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vm, vcpu);
                break;
        case VM_SET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                error = vm_set_topology(sc->vm, topology->sockets,
                    topology->cores, topology->threads, topology->maxcpus);
                break;
        case VM_GET_TOPOLOGY:
                topology = (struct vm_cpu_topology *)data;
                vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
                    &topology->threads, &topology->maxcpus);
                error = 0;
                break;
        default:
                error = ENOTTY;
                break;
        }

        if (state_changed == 1)
                vcpu_unlock_one(sc, vcpu);
        else if (state_changed == 2)
                vcpu_unlock_all(sc);

done:
        /* Make sure that no handler returns a bogus value like ERESTART */
        KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
        return (error);
}

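/*
 * mmap(2) handler for /dev/vmm/<name>: translates the file offset, treated
 * as a guest physical address, into the VM object backing the sysmem segment
 * that maps it.  Executable mappings are rejected, as are devmem segments,
 * which must be mapped through their own cdev instead.
 */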
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
        struct vmmdev_softc *sc;
        vm_paddr_t gpa;
        size_t len;
        vm_ooffset_t segoff, first, last;
        int error, found, segid;
        bool sysmem;

        first = *offset;
        last = first + mapsize;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        sc = vmmdev_lookup2(cdev);
        if (sc == NULL) {
                /* virtual machine is in the process of being created */
                return (EINVAL);
        }

        /*
         * Get a read lock on the guest memory map by freezing any vcpu.
         */
        error = vcpu_lock_one(sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        gpa = 0;
        found = 0;
        while (!found) {
                error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
                    NULL, NULL);
                if (error)
                        break;

                if (first >= gpa && last <= gpa + len)
                        found = 1;
                else
                        gpa += len;
        }

        if (found) {
                error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
                KASSERT(error == 0 && *objp != NULL,
                    ("%s: invalid memory segment %d", __func__, segid));
                if (sysmem) {
                        vm_object_reference(*objp);
                        *offset = segoff + (first - gpa);
                } else {
                        error = EINVAL;
                }
        }
        vcpu_unlock_one(sc, VM_MAXCPU - 1);
        return (error);
}

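/*
 * Tear down a virtual machine: free the devmem softc list, destroy the cdev
 * and the vm instance, and unlink the softc from the global list.  This runs
 * either directly on a failed create, or as the destroy_dev_sched_cb()
 * callback once the cdev's 'si_threadcount' drains to zero.
 */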
static void
vmmdev_destroy(void *arg)
{
        struct vmmdev_softc *sc = arg;
        struct devmem_softc *dsc;
        int error;

        error = vcpu_lock_all(sc);
        KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

        while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
                KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
                SLIST_REMOVE_HEAD(&sc->devmem, link);
                free(dsc->name, M_VMMDEV);
                free(dsc, M_VMMDEV);
        }

        if (sc->cdev != NULL)
                destroy_dev(sc->cdev);

        if (sc->vm != NULL)
                vm_destroy(sc->vm);

        if ((sc->flags & VSC_LINKED) != 0) {
                mtx_lock(&vmmdev_mtx);
                SLIST_REMOVE(&head, sc, vmmdev_softc, link);
                mtx_unlock(&vmmdev_mtx);
        }

        free(sc, M_VMMDEV);
}

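/*
 * Handler for the 'hw.vmm.destroy' sysctl, e.g.:
 *
 *	sysctl hw.vmm.destroy=freebsd-vm
 *
 * Clearing 'sc->cdev' under the mutex marks the VM as dying; the actual
 * teardown happens later in vmmdev_destroy() once all cdev threads drain.
 */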
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
        int error;
        char buf[VM_MAX_NAMELEN];
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        if (sc == NULL || sc->cdev == NULL) {
                mtx_unlock(&vmmdev_mtx);
                return (EINVAL);
        }

        /*
         * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
         * goes down to 0 so we should not do it again in the callback.
         *
         * Setting 'sc->cdev' to NULL is also used to indicate that the VM
         * is scheduled for destruction.
         */
        cdev = sc->cdev;
        sc->cdev = NULL;
        mtx_unlock(&vmmdev_mtx);

        /*
         * Schedule all cdevs to be destroyed:
         *
         * - any new operations on the 'cdev' will return an error (ENXIO).
         *
         * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
         *   be destroyed and the callback will be invoked in a taskqueue
         *   context.
         *
         * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
         */
        SLIST_FOREACH(dsc, &sc->devmem, link) {
                KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
                destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
        }
        destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
        .d_name         = "vmmdev",
        .d_version      = D_VERSION,
        .d_ioctl        = vmmdev_ioctl,
        .d_mmap_single  = vmmdev_mmap_single,
        .d_read         = vmmdev_rw,
        .d_write        = vmmdev_rw,
};

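/*
 * Handler for the 'hw.vmm.create' sysctl, e.g.:
 *
 *	sysctl hw.vmm.create=freebsd-vm
 *
 * creates the vm instance and its /dev/vmm/<name> node.  The name is looked
 * up a second time because the mutex is dropped across vm_create().
 */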
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vm *vm;
        struct cdev *cdev;
        struct vmmdev_softc *sc, *sc2;
        char buf[VM_MAX_NAMELEN];

        strlcpy(buf, "beavis", sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(buf);
        mtx_unlock(&vmmdev_mtx);
        if (sc != NULL)
                return (EEXIST);

        error = vm_create(buf, &vm);
        if (error != 0)
                return (error);

        sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
        sc->vm = vm;
        SLIST_INIT(&sc->devmem);

        /*
         * Lookup the name again just in case somebody sneaked in when we
         * dropped the lock.
         */
        mtx_lock(&vmmdev_mtx);
        sc2 = vmmdev_lookup(buf);
        if (sc2 == NULL) {
                SLIST_INSERT_HEAD(&head, sc, link);
                sc->flags |= VSC_LINKED;
        }
        mtx_unlock(&vmmdev_mtx);

        if (sc2 != NULL) {
                vmmdev_destroy(sc);
                return (EEXIST);
        }

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
                           UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
        if (error != 0) {
                vmmdev_destroy(sc);
                return (error);
        }

        mtx_lock(&vmmdev_mtx);
        sc->cdev = cdev;
        sc->cdev->si_drv1 = sc;
        mtx_unlock(&vmmdev_mtx);

        return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
            NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
        mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
        int error;

        if (SLIST_EMPTY(&head))
                error = 0;
        else
                error = EBUSY;

        return (error);
}

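/*
 * mmap(2) handler for a devmem cdev (/dev/vmm.io/<vmname>.<segname>):
 * returns the VM object backing the device memory segment, provided the
 * request lies within the segment and does not ask for execute permission.
 */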
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
        struct devmem_softc *dsc;
        vm_ooffset_t first, last;
        size_t seglen;
        int error;
        bool sysmem;

        dsc = cdev->si_drv1;
        if (dsc == NULL) {
                /* 'cdev' has been created but is not ready for use */
                return (ENXIO);
        }

        first = *offset;
        last = *offset + len;
        if ((nprot & PROT_EXEC) || first < 0 || first >= last)
                return (EINVAL);

        error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
        if (error)
                return (error);

        error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
        KASSERT(error == 0 && !sysmem && *objp != NULL,
            ("%s: invalid devmem segment %d", __func__, dsc->segid));

        vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

        if (seglen >= last) {
                vm_object_reference(*objp);
                return (0);
        } else {
                return (EINVAL);
        }
}

static struct cdevsw devmemsw = {
        .d_name         = "devmem",
        .d_version      = D_VERSION,
        .d_mmap_single  = devmem_mmap_single,
};

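/*
 * Create the /dev/vmm.io/<vmname>.<devname> node for a device memory segment
 * and link its softc onto the owning VM.  'devname' is consumed: it is freed
 * when the devmem softc is eventually torn down in vmmdev_destroy().
 */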
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
        struct devmem_softc *dsc;
        struct vmmdev_softc *sc;
        struct cdev *cdev;
        int error;

        error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
            UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
        if (error)
                return (error);

        dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

        mtx_lock(&vmmdev_mtx);
        sc = vmmdev_lookup(vmname);
        KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
        if (sc->cdev == NULL) {
                /* virtual machine is being created or destroyed */
                mtx_unlock(&vmmdev_mtx);
                free(dsc, M_VMMDEV);
                destroy_dev_sched_cb(cdev, NULL, NULL);
                return (ENODEV);
        }

        dsc->segid = segid;
        dsc->name = devname;
        dsc->cdev = cdev;
        dsc->sc = sc;
        SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
        mtx_unlock(&vmmdev_mtx);

        /* The 'cdev' is ready for use after 'si_drv1' is initialized */
        cdev->si_drv1 = dsc;
        return (0);
}

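/*
 * Callback invoked once a devmem cdev has been destroyed; the softc itself
 * stays on the VM's devmem list and is freed later by vmmdev_destroy().
 */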
static void
devmem_destroy(void *arg)
{
        struct devmem_softc *dsc = arg;

        KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
        dsc->cdev = NULL;
        dsc->sc = NULL;
}