2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2011 NetApp, Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/module.h>
40 #include <sys/pciio.h>
43 #include <sys/sysctl.h>
45 #include <dev/pci/pcivar.h>
46 #include <dev/pci/pcireg.h>
48 #include <machine/resource.h>
50 #include <machine/vmm.h>
51 #include <machine/vmm_dev.h>
53 #include "vmm_lapic.h"
/*
 * Upper bound on MSI messages per passthrough device.  It sizes the fixed
 * res[]/cookie[]/arg[] arrays in the device state and is the limit that
 * ppt_setup_msi() checks 'numvec' against.
 */
61 #define MAX_MSIMSGS 32
64 * If the MSI-X table is located in the middle of a BAR then that MMIO
65 * region gets split into two segments - one segment above the MSI-X table
66 * and the other segment below the MSI-X table - with a hole in place of
67 * the MSI-X table so accesses to it can be trapped and emulated.
69 * So, allocate an MMIO segment for each BAR register + 1 additional segment.
/*
 * Size of the per-device mmio[] segment array: (PCIR_MAX_BAR_0 + 1) entries
 * plus one extra.  PCIR_MAX_BAR_0 is defined outside this file.
 */
71 #define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1)
/*
 * Malloc type used for the dynamically sized MSI-X arrays (res, cookie, arg)
 * that ppt_setup_msix() allocates and ppt_teardown_msix() frees.
 */
73 MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
/*
 * Per-vector argument registered with bus_setup_intr() alongside pptintr.
 * NOTE(review): other code in this file also uses 'addr' and 'msg_data'
 * members of this struct; their declarations are not visible in this excerpt.
 */
75 struct pptintr_arg { /* pptintr(pptintr_arg) */
/* Back-pointer to the owning device; the handler follows it to reach ppt->vm. */
76 struct pptdev *pptdev;
/*
 * Members of the per-device softc (struct pptdev).  The struct header, the
 * 'dev' member and the nested msi/msix sub-struct braces are not visible in
 * this excerpt; the grouping described below is inferred from how the fields
 * are accessed (ppt->msi.* and ppt->msix.*) -- confirm against the full file.
 */
89 struct vm *vm; /* owner of this device */
/* Linkage on the global pptdev_list. */
90 TAILQ_ENTRY(pptdev) next;
/* MMIO segments recorded for this device; cleared by ppt_unmap_mmio(). */
91 struct pptseg mmio[MAX_MMIOSEGS];
/* Presumably the MSI state (accessed as ppt->msi.*): fixed-size arrays. */
93 int num_msgs; /* guest state */
95 int startrid; /* host state */
96 struct resource *res[MAX_MSIMSGS];
97 void *cookie[MAX_MSIMSGS];
98 struct pptintr_arg arg[MAX_MSIMSGS];
/*
 * Presumably the MSI-X state (accessed as ppt->msix.*): the arrays here are
 * pointers that ppt_setup_msix() allocates with malloc(..., M_PPTMSIX, ...).
 */
105 struct resource *msix_table_res;
106 struct resource **res;
108 struct pptintr_arg *arg;
/* Sysctl subtree "ppt" under the _hw_vmm node. */
112 SYSCTL_DECL(_hw_vmm);
113 SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
/*
 * Count of passthrough devices, exported read-only as the "devices" sysctl
 * and returned by ppt_avail_devices().  The code that updates it is not
 * visible in this excerpt.
 */
115 static int num_pptdevs;
116 SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
117 "number of pci passthru devices");
/* Global list of passthrough devices; attach inserts, detach removes. */
119 static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
/*
 * Device probe method (registered through DEVMETHOD(device_probe, ...)).
 * Reads the device's bus/slot/function and PCI header type, then consults
 * vmm_is_pptdev().
 * NOTE(review): the values returned by the two conditional branches are not
 * visible in this excerpt; only the final BUS_PROBE_NOWILDCARD return is.
 */
122 ppt_probe(device_t dev)
125 struct pci_devinfo *dinfo;
127 dinfo = (struct pci_devinfo *)device_get_ivars(dev);
129 bus = pci_get_bus(dev);
130 slot = pci_get_slot(dev);
131 func = pci_get_function(dev);
134 * To qualify as a pci passthrough device a device must:
135 * - be allowed by administrator to be used in this role
136 * - be an endpoint device
138 if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
140 else if (vmm_is_pptdev(bus, slot, func))
144 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
145 * SR-IOV infrastructure specified as "ppt" passthrough devices.
146 * All normal devices that did not have "ppt" specified as their
147 * driver will not be matched by this.
149 return (BUS_PROBE_NOWILDCARD);
/*
 * Device attach method.  In the visible lines it removes the device (by PCI
 * RID) from the host IOMMU domain, appends the softc to pptdev_list and logs
 * "attached".  Other steps of the function are not visible in this excerpt.
 */
153 ppt_attach(device_t dev)
157 ppt = device_get_softc(dev);
159 iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
161 TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
165 device_printf(dev, "attached\n");
/*
 * Device detach method; mirrors ppt_attach().  In the visible lines it
 * unlinks the softc from pptdev_list, disables bus mastering and adds the
 * device back to the host IOMMU domain.
 * NOTE(review): any precondition checked before the removal (e.g. whether a
 * VM still owns the device) is not visible in this excerpt.
 */
171 ppt_detach(device_t dev)
175 ppt = device_get_softc(dev);
180 TAILQ_REMOVE(&pptdev_list, ppt, next);
181 pci_disable_busmaster(dev);
182 iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
/*
 * newbus glue: method table mapping probe/attach/detach to the functions
 * above, followed by registration of the "ppt" driver on the pci bus with
 * struct pptdev as its softc.  The table terminator is not visible in this
 * excerpt.
 */
187 static device_method_t ppt_methods[] = {
188 /* Device interface */
189 DEVMETHOD(device_probe, ppt_probe),
190 DEVMETHOD(device_attach, ppt_attach),
191 DEVMETHOD(device_detach, ppt_detach),
195 static devclass_t ppt_devclass;
196 DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
197 DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
/*
 * Look up a passthrough device by PCI bus/slot/function by walking
 * pptdev_list and comparing each entry's location with the arguments.
 * Returns a struct pptdev pointer; callers in this file test the result
 * before use.  NOTE(review): the return statements (match and no-match) are
 * not visible in this excerpt.
 */
199 static struct pptdev *
200 ppt_find(int bus, int slot, int func)
206 TAILQ_FOREACH(ppt, &pptdev_list, next) {
208 b = pci_get_bus(dev);
209 s = pci_get_slot(dev);
210 f = pci_get_function(dev);
211 if (bus == b && slot == s && func == f)
/*
 * Walk all MAX_MMIOSEGS slots of ppt->mmio, unmap each segment's
 * [gpa, gpa + len) range from the VM and zero the slot.  The result of
 * vm_unmap_mmio() is deliberately discarded.
 * NOTE(review): any per-slot skip condition (e.g. an empty segment) is not
 * visible in this excerpt.
 */
218 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
223 for (i = 0; i < MAX_MMIOSEGS; i++) {
227 (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
228 bzero(seg, sizeof(struct pptseg));
/*
 * Release every interrupt resource recorded in ppt->msi.  Does nothing
 * further when num_msgs is 0 (the body of that check is not visible here).
 * For each message: tear down the handler, release the IRQ resource whose
 * rid is startrid + i, and clear the bookkeeping slots.
 * NOTE(review): the conditions guarding the teardown/release calls are not
 * visible in this excerpt.
 */
233 ppt_teardown_msi(struct pptdev *ppt)
237 struct resource *res;
239 if (ppt->msi.num_msgs == 0)
242 for (i = 0; i < ppt->msi.num_msgs; i++) {
243 rid = ppt->msi.startrid + i;
244 res = ppt->msi.res[i];
245 cookie = ppt->msi.cookie[i];
248 bus_teardown_intr(ppt->dev, res, cookie);
251 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
253 ppt->msi.res[i] = NULL;
254 ppt->msi.cookie[i] = NULL;
/* startrid 1 is the MSI case (0 is the legacy interrupt), so only then give the MSI allocation back. */
257 if (ppt->msi.startrid == 1)
258 pci_release_msi(ppt->dev);
/* Marks the MSI state as empty so a repeated call returns early. */
260 ppt->msi.num_msgs = 0;
/*
 * Tear down a single MSI-X vector: remove its handler, release the IRQ
 * resource with rid startrid + idx, and clear the res/cookie slots.
 * 'idx' indexes ppt->msix.res[] and ppt->msix.cookie[] directly; no range
 * check on it is visible in this excerpt.
 * NOTE(review): the conditions guarding the teardown/release calls are not
 * visible here.
 */
264 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
267 struct resource *res;
270 rid = ppt->msix.startrid + idx;
271 res = ppt->msix.res[idx];
272 cookie = ppt->msix.cookie[idx];
275 bus_teardown_intr(ppt->dev, res, cookie);
278 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
280 ppt->msix.res[idx] = NULL;
281 ppt->msix.cookie[idx] = NULL;
/*
 * Undo everything ppt_setup_msix() set up: tear down each vector, release
 * the MSI-X table memory resource if one is held, free the three
 * M_PPTMSIX arrays, release the MSI-X allocation and reset num_msgs.
 * Does nothing further when num_msgs is already 0 (the body of that check
 * is not visible here).
 */
285 ppt_teardown_msix(struct pptdev *ppt)
289 if (ppt->msix.num_msgs == 0)
292 for (i = 0; i < ppt->msix.num_msgs; i++)
293 ppt_teardown_msix_intr(ppt, i);
295 if (ppt->msix.msix_table_res) {
296 bus_release_resource(ppt->dev, SYS_RES_MEMORY,
297 ppt->msix.msix_table_rid,
298 ppt->msix.msix_table_res);
299 ppt->msix.msix_table_res = NULL;
300 ppt->msix.msix_table_rid = 0;
/*
 * NOTE(review): the freed pointers are not visibly reset to NULL; later
 * use appears to be gated by num_msgs being 0 -- confirm no other path
 * dereferences them after this point.
 */
303 free(ppt->msix.res, M_PPTMSIX);
304 free(ppt->msix.cookie, M_PPTMSIX);
305 free(ppt->msix.arg, M_PPTMSIX);
307 pci_release_msi(ppt->dev);
309 ppt->msix.num_msgs = 0;
/* Return the current value of the num_pptdevs counter. */
313 ppt_avail_devices(void)
316 return (num_pptdevs);
/*
 * Walks pptdev_list on behalf of 'vm'.
 * NOTE(review): the loop body and return statement are not visible in this
 * excerpt; judging by the name it presumably counts the devices whose owner
 * is 'vm' -- confirm against the full file.
 */
320 ppt_assigned_devices(struct vm *vm)
326 TAILQ_FOREACH(ppt, &pptdev_list, next) {
/*
 * Test whether guest physical address 'gpa' falls inside any recorded MMIO
 * segment, i.e. seg->gpa <= gpa < seg->gpa + seg->len, scanning every device
 * on pptdev_list and each of its MAX_MMIOSEGS slots.
 * NOTE(review): the filter on device ownership, the skip for empty segments
 * and the return values are not visible in this excerpt.
 */
334 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
340 TAILQ_FOREACH(ppt, &pptdev_list, next) {
344 for (i = 0; i < MAX_MMIOSEGS; i++) {
348 if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
/*
 * Reset a passthrough device.  Per the comment below, a function-level reset
 * is tried first and a D3 power-state cycle is the fallback.
 * NOTE(review): the FLR call itself and the test of its result are not
 * visible in this excerpt; only one of its arguments is.
 */
357 ppt_pci_reset(device_t dev)
/* Argument of the elided call: the larger of (max completion timeout / 1000) and 10. */
362 max(pcie_get_max_completion_timeout(dev) / 1000, 10),
367 * If FLR fails, attempt a power-management reset by cycling
368 * the device in/out of D3 state.
369 * PCI spec says we can only go into D3 state from D0 state.
370 * Transition from D[12] into D0 before going to D3 state.
/* Remember the starting power state so it can be restored at the end. */
372 ps = pci_get_powerstate(dev);
373 if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3)
374 pci_set_powerstate(dev, PCI_POWERSTATE_D0);
375 if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3)
376 pci_set_powerstate(dev, PCI_POWERSTATE_D3);
377 pci_set_powerstate(dev, ps);
/*
 * Hand the passthrough device at bus/slot/func to 'vm'.  Visible steps:
 * look the device up, refuse if another VM already owns it, reset the device
 * with its PCI state saved and restored around the reset, then add it to the
 * VM's IOMMU domain.
 * NOTE(review): the NULL check on ppt_find(), the error codes returned and
 * the assignment of ppt->vm are not visible in this excerpt.
 */
381 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
385 ppt = ppt_find(bus, slot, func);
388 * If this device is owned by a different VM then we
389 * cannot change its owner.
391 if (ppt->vm != NULL && ppt->vm != vm)
394 pci_save_state(ppt->dev);
395 ppt_pci_reset(ppt->dev);
396 pci_restore_state(ppt->dev);
398 iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
/*
 * Take the passthrough device at bus/slot/func away from 'vm'.  Visible
 * steps, in order: reset the device (PCI state saved/restored around it),
 * unmap its MMIO segments, tear down MSI and MSI-X state, and remove it from
 * the VM's IOMMU domain.
 * NOTE(review): the ownership test described by the comment below, the
 * clearing of ppt->vm and the return values are not visible in this excerpt.
 */
405 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
409 ppt = ppt_find(bus, slot, func);
412 * If this device is not owned by this 'vm' then bail out.
417 pci_save_state(ppt->dev);
418 ppt_pci_reset(ppt->dev);
419 pci_restore_state(ppt->dev);
420 ppt_unmap_mmio(vm, ppt);
421 ppt_teardown_msi(ppt);
422 ppt_teardown_msix(ppt);
423 iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
/*
 * Walk pptdev_list and call vm_unassign_pptdev() with each device's
 * bus/slot/function on behalf of 'vm'.
 * NOTE(review): any per-device filter (such as an ownership check) ahead of
 * the call is not visible in this excerpt.
 */
431 ppt_unassign_all(struct vm *vm)
437 TAILQ_FOREACH(ppt, &pptdev_list, next) {
440 bus = pci_get_bus(dev);
441 slot = pci_get_slot(dev);
442 func = pci_get_function(dev);
443 vm_unassign_pptdev(vm, bus, slot, func);
/*
 * Map 'len' bytes of host physical memory at 'hpa' into 'vm' at guest
 * physical address 'gpa' for the device at bus/slot/func, using
 * vm_map_mmio().  The loop scans the device's MAX_MMIOSEGS slots.
 * NOTE(review): the slot-selection test, the recording of gpa/len in the
 * chosen slot and the return values are not visible in this excerpt.
 */
451 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
452 vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
458 ppt = ppt_find(bus, slot, func);
463 for (i = 0; i < MAX_MMIOSEGS; i++) {
466 error = vm_map_mmio(vm, gpa, len, hpa);
/*
 * Interrupt filter registered by ppt_setup_msi()/ppt_setup_msix().  It
 * forwards the interrupt to the owning VM as an MSI built from the address
 * and data stored in the per-vector argument.
 * NOTE(review): the function signature, the assignment of 'pptarg' and the
 * condition around the lapic_intr_msi() call are not visible in this excerpt.
 */
483 struct pptintr_arg *pptarg;
486 ppt = pptarg->pptdev;
489 lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
493 * This is not expected to happen - panic?
498 * For legacy interrupts give other filters a chance in case
499 * the interrupt was not generated by the passthrough device.
/* startrid 0 is the legacy-interrupt case set up in ppt_setup_msi(). */
501 if (ppt->msi.startrid == 0)
502 return (FILTER_STRAY);
504 return (FILTER_HANDLED);
/*
 * Configure 'numvec' MSI (or one legacy) interrupt for the device at
 * bus/slot/func owned by 'vm'.  Each vector i is delivered to the guest with
 * address 'addr' and data 'msg + i'.
 *
 * Visible flow: validate numvec against [0, MAX_MSIMSGS], look up the device
 * and check ownership, drop any previous MSI setup, stop if numvec is 0,
 * choose legacy (startrid 0, shareable) or MSI (startrid 1) from
 * pci_msi_count(), allocate the MSI vectors, then allocate an IRQ resource
 * and install pptintr for each vector.  ppt_teardown_msi() is called on what
 * appears to be the failure path.
 *
 * NOTE(review): 'vcpu' is not referenced in any visible line.  The returned
 * error codes, several branch bodies and the loop's break logic are not
 * visible in this excerpt.
 */
508 ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
509 uint64_t addr, uint64_t msg, int numvec)
512 int msi_count, startrid, error, tmp;
515 if (numvec < 0 || numvec > MAX_MSIMSGS)
518 ppt = ppt_find(bus, slot, func);
521 if (ppt->vm != vm) /* Make sure we own this device */
524 /* Free any allocated resources */
525 ppt_teardown_msi(ppt);
527 if (numvec == 0) /* nothing more to do */
531 msi_count = pci_msi_count(ppt->dev);
532 if (msi_count == 0) {
533 startrid = 0; /* legacy interrupt */
535 flags |= RF_SHAREABLE;
537 startrid = 1; /* MSI */
540 * The device must be capable of supporting the number of vectors
541 * the guest wants to allocate.
543 if (numvec > msi_count)
547 * Make sure that we can allocate all the MSI vectors that are needed
/* 'tmp' carries the vector count in and out of pci_alloc_msi(); its initialisation is not visible here. */
552 error = pci_alloc_msi(ppt->dev, &tmp);
555 else if (tmp != numvec) {
556 pci_release_msi(ppt->dev);
563 ppt->msi.startrid = startrid;
566 * Allocate the irq resource and attach it to the interrupt handler.
568 for (i = 0; i < numvec; i++) {
/* num_msgs is advanced before the allocation so that ppt_teardown_msi() also covers slot i. */
569 ppt->msi.num_msgs = i + 1;
570 ppt->msi.cookie[i] = NULL;
573 ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
575 if (ppt->msi.res[i] == NULL)
578 ppt->msi.arg[i].pptdev = ppt;
579 ppt->msi.arg[i].addr = addr;
580 ppt->msi.arg[i].msg_data = msg + i;
582 error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
583 INTR_TYPE_NET | INTR_MPSAFE,
584 pptintr, NULL, &ppt->msi.arg[i],
585 &ppt->msi.cookie[i]);
591 ppt_teardown_msi(ppt);
/*
 * Configure MSI-X vector 'idx' for the device at bus/slot/func owned by
 * 'vm', delivering it to the guest with address 'addr' and data 'msg'.
 * 'vector_control' is the guest's vector-control word: if its mask bit
 * (PCIM_MSIX_VCTRL_MASK) is clear the vector is (re)installed, otherwise it
 * is torn down.
 *
 * On the first call for a device (msix.num_msgs == 0) the function also
 * allocates the res/cookie/arg arrays sized by pci_msix_count(), claims the
 * BAR holding the MSI-X table and allocates all MSI-X vectors.
 *
 * NOTE(review): 'vcpu' is not referenced in any visible line.  No check of
 * 'idx' against msix.num_msgs is visible before it indexes the msix arrays
 * -- confirm one exists in the elided lines.  Return values and the end of
 * the function are not visible in this excerpt.
 */
599 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
600 int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
603 struct pci_devinfo *dinfo;
604 int numvec, alloced, rid, error;
605 size_t res_size, cookie_size, arg_size;
607 ppt = ppt_find(bus, slot, func);
610 if (ppt->vm != vm) /* Make sure we own this device */
613 dinfo = device_get_ivars(ppt->dev);
618 * First-time configuration:
619 * Allocate the MSI-X table
620 * Allocate the IRQ resources
621 * Set up some variables in ppt->msix
623 if (ppt->msix.num_msgs == 0) {
624 numvec = pci_msix_count(ppt->dev);
628 ppt->msix.startrid = 1;
629 ppt->msix.num_msgs = numvec;
631 res_size = numvec * sizeof(ppt->msix.res[0]);
632 cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
633 arg_size = numvec * sizeof(ppt->msix.arg[0]);
/* The visible allocations pass M_WAITOK | M_ZERO, and their results are not NULL-checked in the visible lines. */
635 ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
636 ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
638 ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
/* The rid of the table resource is the BAR recorded in the device's MSI-X config. */
640 rid = dinfo->cfg.msix.msix_table_bar;
641 ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
642 SYS_RES_MEMORY, &rid, RF_ACTIVE);
644 if (ppt->msix.msix_table_res == NULL) {
645 ppt_teardown_msix(ppt);
648 ppt->msix.msix_table_rid = rid;
/* 'alloced' carries the vector count in and out; its initialisation is not visible here. */
651 error = pci_alloc_msix(ppt->dev, &alloced);
652 if (error || alloced != numvec) {
653 ppt_teardown_msix(ppt);
/* A short allocation without an error code is reported as ENOSPC. */
654 return (error == 0 ? ENOSPC: error);
658 if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
659 /* Tear down the IRQ if it's already set up */
660 ppt_teardown_msix_intr(ppt, idx);
662 /* Allocate the IRQ resource */
663 ppt->msix.cookie[idx] = NULL;
664 rid = ppt->msix.startrid + idx;
665 ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
667 if (ppt->msix.res[idx] == NULL)
670 ppt->msix.arg[idx].pptdev = ppt;
671 ppt->msix.arg[idx].addr = addr;
672 ppt->msix.arg[idx].msg_data = msg;
674 /* Setup the MSI-X interrupt */
675 error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
676 INTR_TYPE_NET | INTR_MPSAFE,
677 pptintr, NULL, &ppt->msix.arg[idx],
678 &ppt->msix.cookie[idx]);
/*
 * NOTE(review): the lines below look like the bus_setup_intr() failure
 * path (the guarding condition is not visible).  If so,
 * bus_teardown_intr() is being called for a handler that was never
 * installed, with the cookie cleared to NULL above -- confirm this is
 * intended.
 */
681 bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
682 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
683 ppt->msix.cookie[idx] = NULL;
684 ppt->msix.res[idx] = NULL;
688 /* Masked, tear it down if it's already been set up */
689 ppt_teardown_msix_intr(ppt, idx);