/* usr.sbin/bhyve/pci_passthru.c (FreeBSD) */
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #ifndef WITHOUT_CAPSICUM
36 #include <sys/capsicum.h>
37 #endif
38 #include <sys/types.h>
39 #include <sys/mman.h>
40 #include <sys/pciio.h>
41 #include <sys/ioctl.h>
42
43 #include <dev/io/iodev.h>
44 #include <dev/pci/pcireg.h>
45
46 #include <machine/iodev.h>
47
48 #ifndef WITHOUT_CAPSICUM
49 #include <capsicum_helpers.h>
50 #endif
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
59
60 #include <machine/vmm.h>
61 #include <vmmapi.h>
62
63 #include "config.h"
64 #include "debug.h"
65 #include "pci_emul.h"
66 #include "mem.h"
67
68 #ifndef _PATH_DEVPCI
69 #define _PATH_DEVPCI    "/dev/pci"
70 #endif
71
72 #ifndef _PATH_DEVIO
73 #define _PATH_DEVIO     "/dev/io"
74 #endif
75
76 #ifndef _PATH_MEM
77 #define _PATH_MEM       "/dev/mem"
78 #endif
79
80 #define LEGACY_SUPPORT  1
81
82 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
83 #define MSIX_CAPLEN 12
84
/* Lazily-opened file descriptors shared by all passthru devices. */
static int pcifd = -1;	/* /dev/pci: config-space ioctls */
static int iofd = -1;	/* /dev/io: port I/O access */
static int memfd = -1;	/* /dev/mem: mmap of the physical PBA page */

/* Per-passthru-device emulation state. */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* backing emulated PCI device */
	/* Cached "real" BAR layout of the host device (type/size/addr). */
	struct pcibar psc_bar[PCI_BARMAX + 1];
	struct {
		int		capoff;		/* MSI cap offset; 0 if absent */
		int		msgctrl;	/* cached MSI message control */
		int		emulated;	/* 1 if cap synthesized by us */
	} psc_msi;
	struct {
		int		capoff;		/* MSI-X cap offset; 0 if absent */
	} psc_msix;
	struct pcisel psc_sel;		/* host bus/dev/func selector */
};
102
103 static int
104 msi_caplen(int msgctrl)
105 {
106         int len;
107         
108         len = 10;               /* minimum length of msi capability */
109
110         if (msgctrl & PCIM_MSICTRL_64BIT)
111                 len += 4;
112
113 #if 0
114         /*
115          * Ignore the 'mask' and 'pending' bits in the MSI capability.
116          * We'll let the guest manipulate them directly.
117          */
118         if (msgctrl & PCIM_MSICTRL_VECTOR)
119                 len += 10;
120 #endif
121
122         return (len);
123 }
124
125 static uint32_t
126 read_config(const struct pcisel *sel, long reg, int width)
127 {
128         struct pci_io pi;
129
130         bzero(&pi, sizeof(pi));
131         pi.pi_sel = *sel;
132         pi.pi_reg = reg;
133         pi.pi_width = width;
134
135         if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
136                 return (0);                             /* XXX */
137         else
138                 return (pi.pi_data);
139 }
140
141 static void
142 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
143 {
144         struct pci_io pi;
145
146         bzero(&pi, sizeof(pi));
147         pi.pi_sel = *sel;
148         pi.pi_reg = reg;
149         pi.pi_width = width;
150         pi.pi_data = data;
151
152         (void)ioctl(pcifd, PCIOCWRITE, &pi);            /* XXX */
153 }
154
155 #ifdef LEGACY_SUPPORT
156 static int
157 passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
158 {
159         int capoff, i;
160         struct msicap msicap;
161         u_char *capdata;
162
163         pci_populate_msicap(&msicap, msgnum, nextptr);
164
165         /*
166          * XXX
167          * Copy the msi capability structure in the last 16 bytes of the
168          * config space. This is wrong because it could shadow something
169          * useful to the device.
170          */
171         capoff = 256 - roundup(sizeof(msicap), 4);
172         capdata = (u_char *)&msicap;
173         for (i = 0; i < sizeof(msicap); i++)
174                 pci_set_cfgdata8(pi, capoff + i, capdata[i]);
175
176         return (capoff);
177 }
178 #endif  /* LEGACY_SUPPORT */
179
180 static int
181 cfginitmsi(struct passthru_softc *sc)
182 {
183         int i, ptr, capptr, cap, sts, caplen, table_size;
184         uint32_t u32;
185         struct pcisel sel;
186         struct pci_devinst *pi;
187         struct msixcap msixcap;
188         uint32_t *msixcap_ptr;
189
190         pi = sc->psc_pi;
191         sel = sc->psc_sel;
192
193         /*
194          * Parse the capabilities and cache the location of the MSI
195          * and MSI-X capabilities.
196          */
197         sts = read_config(&sel, PCIR_STATUS, 2);
198         if (sts & PCIM_STATUS_CAPPRESENT) {
199                 ptr = read_config(&sel, PCIR_CAP_PTR, 1);
200                 while (ptr != 0 && ptr != 0xff) {
201                         cap = read_config(&sel, ptr + PCICAP_ID, 1);
202                         if (cap == PCIY_MSI) {
203                                 /*
204                                  * Copy the MSI capability into the config
205                                  * space of the emulated pci device
206                                  */
207                                 sc->psc_msi.capoff = ptr;
208                                 sc->psc_msi.msgctrl = read_config(&sel,
209                                                                   ptr + 2, 2);
210                                 sc->psc_msi.emulated = 0;
211                                 caplen = msi_caplen(sc->psc_msi.msgctrl);
212                                 capptr = ptr;
213                                 while (caplen > 0) {
214                                         u32 = read_config(&sel, capptr, 4);
215                                         pci_set_cfgdata32(pi, capptr, u32);
216                                         caplen -= 4;
217                                         capptr += 4;
218                                 }
219                         } else if (cap == PCIY_MSIX) {
220                                 /*
221                                  * Copy the MSI-X capability 
222                                  */
223                                 sc->psc_msix.capoff = ptr;
224                                 caplen = 12;
225                                 msixcap_ptr = (uint32_t*) &msixcap;
226                                 capptr = ptr;
227                                 while (caplen > 0) {
228                                         u32 = read_config(&sel, capptr, 4);
229                                         *msixcap_ptr = u32;
230                                         pci_set_cfgdata32(pi, capptr, u32);
231                                         caplen -= 4;
232                                         capptr += 4;
233                                         msixcap_ptr++;
234                                 }
235                         }
236                         ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
237                 }
238         }
239
240         if (sc->psc_msix.capoff != 0) {
241                 pi->pi_msix.pba_bar =
242                     msixcap.pba_info & PCIM_MSIX_BIR_MASK;
243                 pi->pi_msix.pba_offset =
244                     msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
245                 pi->pi_msix.table_bar =
246                     msixcap.table_info & PCIM_MSIX_BIR_MASK;
247                 pi->pi_msix.table_offset =
248                     msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
249                 pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
250                 pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
251
252                 /* Allocate the emulated MSI-X table array */
253                 table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
254                 pi->pi_msix.table = calloc(1, table_size);
255
256                 /* Mask all table entries */
257                 for (i = 0; i < pi->pi_msix.table_count; i++) {
258                         pi->pi_msix.table[i].vector_control |=
259                                                 PCIM_MSIX_VCTRL_MASK;
260                 }
261         }
262
263 #ifdef LEGACY_SUPPORT
264         /*
265          * If the passthrough device does not support MSI then craft a
266          * MSI capability for it. We link the new MSI capability at the
267          * head of the list of capabilities.
268          */
269         if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
270                 int origptr, msiptr;
271                 origptr = read_config(&sel, PCIR_CAP_PTR, 1);
272                 msiptr = passthru_add_msicap(pi, 1, origptr);
273                 sc->psc_msi.capoff = msiptr;
274                 sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
275                 sc->psc_msi.emulated = 1;
276                 pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
277         }
278 #endif
279
280         /* Make sure one of the capabilities is present */
281         if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) 
282                 return (-1);
283         else
284                 return (0);
285 }
286
/*
 * Handle a guest read of the MSI-X table BAR region.
 *
 * 'offset' is relative to the start of the table BAR.  Reads that fall
 * within the mapped PBA page (when the PBA shares a page with the table)
 * are serviced directly from the mmap'ed physical page; reads within the
 * table proper are serviced from the emulated table array.  Returns the
 * value read, or (uint64_t)-1 for unsupported sizes or out-of-range
 * offsets.
 */
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	int index;

	pi = sc->psc_pi;
	/* PBA accesses go straight to the mapped physical page. */
	if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch(size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	/* Reject reads below the table; the BAR may map other device state. */
	if (offset < pi->pi_msix.table_offset)
		return (-1);

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return (-1);

	/* Serve the read from the emulated table entry. */
	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch(size) {
	case 1:
		src8 = (uint8_t *)((void *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((void *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((void *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((void *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}
364
/*
 * Handle a guest write to the MSI-X table BAR region.
 *
 * Writes within the shared PBA page go straight to the mapped physical
 * page.  Writes to the table update the emulated entry and, when MSI-X
 * is enabled and the entry is (or was just) unmasked, reprogram the
 * interrupt routing via vm_setup_pptdev_msix().  Only 4-byte aligned
 * table writes are supported (asserted below).
 */
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
		 uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t vector_control;
	int index;

	pi = sc->psc_pi;
	/* PBA accesses go straight to the mapped physical page. */
	if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch(size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest64 = data;
			break;
		default:
			break;
		}
		return;
	}

	/* Ignore writes below the table start. */
	if (offset < pi->pi_msix.table_offset)
		return;

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return;

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	/* Capture the pre-write mask state before updating the entry. */
	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((void *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void)vm_setup_pptdev_msix(ctx, vcpu,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, index, entry->addr,
			    entry->msg_data, entry->vector_control);
		}
	}
}
439
/*
 * Set up emulation state for the MSI-X table BAR, in particular the
 * case where the PBA shares a 4KB page with the MSI-X table.  When it
 * does, that physical page is mmap'ed from /dev/mem so PBA accesses can
 * be forwarded to the real device.  Returns 0 on success, -1 on mmap
 * failure.
 *
 * NOTE(review): the 'base' parameter is not used in this function —
 * the physical address comes from pi->pi_bar[idx].addr; confirm with
 * callers whether it can be dropped.
 */
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
	int b, s, f;
	int idx;
	size_t remaining;
	uint32_t table_size, table_offset;
	uint32_t pba_size, pba_offset;
	vm_paddr_t start;
	struct pci_devinst *pi = sc->psc_pi;

	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);

	b = sc->psc_sel.pc_bus;
	s = sc->psc_sel.pc_dev;
	f = sc->psc_sel.pc_func;

	/*
	 * If the MSI-X table BAR maps memory intended for
	 * other uses, it is at least assured that the table
	 * either resides in its own page within the region,
	 * or it resides in a page shared with only the PBA.
	 */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	/* Page-aligned span covering the whole table. */
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	idx = pi->pi_msix.table_bar;
	start = pi->pi_bar[idx].addr;
	remaining = pi->pi_bar[idx].size;

	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
		pba_offset = pi->pi_msix.pba_offset;
		pba_size = pi->pi_msix.pba_size;
		if (pba_offset >= table_offset + table_size ||
		    table_offset >= pba_offset + pba_size) {
			/*
			 * If the PBA does not share a page with the MSI-x
			 * tables, no PBA emulation is required.
			 */
			pi->pi_msix.pba_page = NULL;
			pi->pi_msix.pba_page_offset = 0;
		} else {
			/*
			 * The PBA overlaps with either the first or last
			 * page of the MSI-X table region.  Map the
			 * appropriate page.
			 */
			if (pba_offset <= table_offset)
				pi->pi_msix.pba_page_offset = table_offset;
			else
				pi->pi_msix.pba_page_offset = table_offset +
				    table_size - 4096;
			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
			    PROT_WRITE, MAP_SHARED, memfd, start +
			    pi->pi_msix.pba_page_offset);
			if (pi->pi_msix.pba_page == MAP_FAILED) {
				warn(
			    "Failed to map PBA page for MSI-X on %d/%d/%d",
				    b, s, f);
				return (-1);
			}
		}
	}

	return (0);
}
509
510 static int
511 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
512 {
513         int i, error;
514         struct pci_devinst *pi;
515         struct pci_bar_io bar;
516         enum pcibar_type bartype;
517         uint64_t base, size;
518
519         pi = sc->psc_pi;
520
521         /*
522          * Initialize BAR registers
523          */
524         for (i = 0; i <= PCI_BARMAX; i++) {
525                 bzero(&bar, sizeof(bar));
526                 bar.pbi_sel = sc->psc_sel;
527                 bar.pbi_reg = PCIR_BAR(i);
528
529                 if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
530                         continue;
531
532                 if (PCI_BAR_IO(bar.pbi_base)) {
533                         bartype = PCIBAR_IO;
534                         base = bar.pbi_base & PCIM_BAR_IO_BASE;
535                 } else {
536                         switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
537                         case PCIM_BAR_MEM_64:
538                                 bartype = PCIBAR_MEM64;
539                                 break;
540                         default:
541                                 bartype = PCIBAR_MEM32;
542                                 break;
543                         }
544                         base = bar.pbi_base & PCIM_BAR_MEM_BASE;
545                 }
546                 size = bar.pbi_length;
547
548                 if (bartype != PCIBAR_IO) {
549                         if (((base | size) & PAGE_MASK) != 0) {
550                                 warnx("passthru device %d/%d/%d BAR %d: "
551                                     "base %#lx or size %#lx not page aligned\n",
552                                     sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
553                                     sc->psc_sel.pc_func, i, base, size);
554                                 return (-1);
555                         }
556                 }
557
558                 /* Cache information about the "real" BAR */
559                 sc->psc_bar[i].type = bartype;
560                 sc->psc_bar[i].size = size;
561                 sc->psc_bar[i].addr = base;
562
563                 /* Allocate the BAR in the guest I/O or MMIO space */
564                 error = pci_emul_alloc_bar(pi, i, bartype, size);
565                 if (error)
566                         return (-1);
567
568                 /* The MSI-X table needs special handling */
569                 if (i == pci_msix_table_bar(pi)) {
570                         error = init_msix_table(ctx, sc, base);
571                         if (error) 
572                                 return (-1);
573                 }
574
575                 /*
576                  * 64-bit BAR takes up two slots so skip the next one.
577                  */
578                 if (bartype == PCIBAR_MEM64) {
579                         i++;
580                         assert(i <= PCI_BARMAX);
581                         sc->psc_bar[i].type = PCIBAR_MEMHI64;
582                 }
583         }
584         return (0);
585 }
586
587 static int
588 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
589 {
590         int error;
591         struct passthru_softc *sc;
592
593         error = 1;
594         sc = pi->pi_arg;
595
596         bzero(&sc->psc_sel, sizeof(struct pcisel));
597         sc->psc_sel.pc_bus = bus;
598         sc->psc_sel.pc_dev = slot;
599         sc->psc_sel.pc_func = func;
600
601         if (cfginitmsi(sc) != 0) {
602                 warnx("failed to initialize MSI for PCI %d/%d/%d",
603                     bus, slot, func);
604                 goto done;
605         }
606
607         if (cfginitbar(ctx, sc) != 0) {
608                 warnx("failed to initialize BARs for PCI %d/%d/%d",
609                     bus, slot, func);
610                 goto done;
611         }
612
613         pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel,
614             PCIR_COMMAND, 2));
615
616         error = 0;                              /* success */
617 done:
618         return (error);
619 }
620
621 static int
622 passthru_legacy_config(nvlist_t *nvl, const char *opts)
623 {
624         char value[16];
625         int bus, slot, func;
626
627         if (opts == NULL)
628                 return (0);
629
630         if (sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
631                 EPRINTLN("passthru: invalid options \"%s\"", opts);
632                 return (-1);
633         }
634
635         snprintf(value, sizeof(value), "%d", bus);
636         set_config_value_node(nvl, "bus", value);
637         snprintf(value, sizeof(value), "%d", slot);
638         set_config_value_node(nvl, "slot", value);
639         snprintf(value, sizeof(value), "%d", func);
640         set_config_value_node(nvl, "func", value);
641         return (0);
642 }
643
644 static int
645 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
646 {
647         int bus, slot, func, error, memflags;
648         struct passthru_softc *sc;
649         const char *value;
650 #ifndef WITHOUT_CAPSICUM
651         cap_rights_t rights;
652         cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR };
653         cap_ioctl_t io_ioctls[] = { IODEV_PIO };
654 #endif
655
656         sc = NULL;
657         error = 1;
658
659 #ifndef WITHOUT_CAPSICUM
660         cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE);
661 #endif
662
663         memflags = vm_get_memflags(ctx);
664         if (!(memflags & VM_MEM_F_WIRED)) {
665                 warnx("passthru requires guest memory to be wired");
666                 return (error);
667         }
668
669         if (pcifd < 0) {
670                 pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
671                 if (pcifd < 0) {
672                         warn("failed to open %s", _PATH_DEVPCI);
673                         return (error);
674                 }
675         }
676
677 #ifndef WITHOUT_CAPSICUM
678         if (caph_rights_limit(pcifd, &rights) == -1)
679                 errx(EX_OSERR, "Unable to apply rights for sandbox");
680         if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1)
681                 errx(EX_OSERR, "Unable to apply rights for sandbox");
682 #endif
683
684         if (iofd < 0) {
685                 iofd = open(_PATH_DEVIO, O_RDWR, 0);
686                 if (iofd < 0) {
687                         warn("failed to open %s", _PATH_DEVIO);
688                         return (error);
689                 }
690         }
691
692 #ifndef WITHOUT_CAPSICUM
693         if (caph_rights_limit(iofd, &rights) == -1)
694                 errx(EX_OSERR, "Unable to apply rights for sandbox");
695         if (caph_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1)
696                 errx(EX_OSERR, "Unable to apply rights for sandbox");
697 #endif
698
699         if (memfd < 0) {
700                 memfd = open(_PATH_MEM, O_RDWR, 0);
701                 if (memfd < 0) {
702                         warn("failed to open %s", _PATH_MEM);
703                         return (error);
704                 }
705         }
706
707 #ifndef WITHOUT_CAPSICUM
708         cap_rights_clear(&rights, CAP_IOCTL);
709         cap_rights_set(&rights, CAP_MMAP_RW);
710         if (caph_rights_limit(memfd, &rights) == -1)
711                 errx(EX_OSERR, "Unable to apply rights for sandbox");
712 #endif
713
714 #define GET_INT_CONFIG(var, name) do {                                  \
715         value = get_config_value_node(nvl, name);                       \
716         if (value == NULL) {                                            \
717                 EPRINTLN("passthru: missing required %s setting", name); \
718                 return (error);                                         \
719         }                                                               \
720         var = atoi(value);                                              \
721 } while (0)
722
723         GET_INT_CONFIG(bus, "bus");
724         GET_INT_CONFIG(slot, "slot");
725         GET_INT_CONFIG(func, "func");
726
727         if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
728                 warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
729                     bus, slot, func);
730                 goto done;
731         }
732
733         sc = calloc(1, sizeof(struct passthru_softc));
734
735         pi->pi_arg = sc;
736         sc->psc_pi = pi;
737
738         /* initialize config space */
739         error = cfginit(ctx, pi, bus, slot, func);
740 done:
741         if (error) {
742                 free(sc);
743                 vm_unassign_pptdev(ctx, bus, slot, func);
744         }
745         return (error);
746 }
747
748 static int
749 bar_access(int coff)
750 {
751         if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
752                 return (1);
753         else
754                 return (0);
755 }
756
757 static int
758 msicap_access(struct passthru_softc *sc, int coff)
759 {
760         int caplen;
761
762         if (sc->psc_msi.capoff == 0)
763                 return (0);
764
765         caplen = msi_caplen(sc->psc_msi.msgctrl);
766
767         if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
768                 return (1);
769         else
770                 return (0);
771 }
772
773 static int 
774 msixcap_access(struct passthru_softc *sc, int coff)
775 {
776         if (sc->psc_msix.capoff == 0) 
777                 return (0);
778
779         return (coff >= sc->psc_msix.capoff && 
780                 coff < sc->psc_msix.capoff + MSIX_CAPLEN);
781 }
782
/*
 * Config-space read handler.  Returns -1 to tell the caller to fall
 * back to the emulated config space (for BARs, the MSI capability, and
 * the synthesized capability pointer); otherwise stores the value read
 * from the host device in '*rv' and returns 0.
 */
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		 int coff, int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs and MSI capability is emulated.
	 */
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);

#ifdef LEGACY_SUPPORT
	/*
	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
	 * natively.
	 */
	if (sc->psc_msi.emulated) {
		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
			return (-1);
	}
#endif

	/*
	 * Emulate the command register.  If a single read reads both the
	 * command and status registers, read the status register from the
	 * device's config space.
	 */
	if (coff == PCIR_COMMAND) {
		if (bytes <= 2)
			return (-1);
		/* 4-byte read: real status in the high half, emulated
		 * command in the low half. */
		*rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 |
		    pci_get_cfgdata16(pi, PCIR_COMMAND);
		return (0);
	}

	/* Everything else just read from the device's config space */
	*rv = read_config(&sc->psc_sel, coff, bytes);

	return (0);
}
826
/*
 * Config-space write handler for a passed-through device.
 *
 * Returns -1 to tell the caller to fall back to the generic emulation
 * (used for the BAR registers, which are fully emulated), 0 when the
 * write has been handled here.  Writes that are not intercepted are
 * forwarded to the physical device's config space.
 */
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		  int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	struct passthru_softc *sc;
	uint16_t cmd_old;

	sc = pi->pi_arg;

	/*
	 * PCI BARs are emulated
	 */
	if (bar_access(coff))
		return (-1);

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		/* Update the emulated capability, then program the host. */
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
		    PCIY_MSI);
		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
			pi->pi_msi.addr, pi->pi_msi.msg_data,
			pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	/* MSI-X capability is emulated as well. */
	if (msixcap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
		    PCIY_MSIX);
		if (pi->pi_msix.enabled) {
			/*
			 * Re-program every table entry from the emulated
			 * MSI-X table into the host interrupt routing.
			 */
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx, vcpu,
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		} else {
			error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus,
			    sc->psc_sel.pc_dev, sc->psc_sel.pc_func);
			if (error)
				err(1, "vm_disable_pptdev_msix");
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	/* Forward the write to the physical device. */
	write_config(&sc->psc_sel, coff, bytes, val);
	if (coff == PCIR_COMMAND) {
		/*
		 * Keep the emulated command register in sync and notify
		 * the core emulation so decode enables can be updated.
		 * cmd_old is captured from the emulated config space
		 * before it is overwritten below.
		 */
		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
		if (bytes == 1)
			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
		else if (bytes == 2)
			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
		pci_emul_cmd_changed(pi, cmd_old);
	}

	return (0);
}
907
908 static void
909 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
910                uint64_t offset, int size, uint64_t value)
911 {
912         struct passthru_softc *sc;
913         struct iodev_pio_req pio;
914
915         sc = pi->pi_arg;
916
917         if (baridx == pci_msix_table_bar(pi)) {
918                 msix_table_write(ctx, vcpu, sc, offset, size, value);
919         } else {
920                 assert(pi->pi_bar[baridx].type == PCIBAR_IO);
921                 bzero(&pio, sizeof(struct iodev_pio_req));
922                 pio.access = IODEV_PIO_WRITE;
923                 pio.port = sc->psc_bar[baridx].addr + offset;
924                 pio.width = size;
925                 pio.val = value;
926                 
927                 (void)ioctl(iofd, IODEV_PIO, &pio);
928         }
929 }
930
931 static uint64_t
932 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
933               uint64_t offset, int size)
934 {
935         struct passthru_softc *sc;
936         struct iodev_pio_req pio;
937         uint64_t val;
938
939         sc = pi->pi_arg;
940
941         if (baridx == pci_msix_table_bar(pi)) {
942                 val = msix_table_read(sc, offset, size);
943         } else {
944                 assert(pi->pi_bar[baridx].type == PCIBAR_IO);
945                 bzero(&pio, sizeof(struct iodev_pio_req));
946                 pio.access = IODEV_PIO_READ;
947                 pio.port = sc->psc_bar[baridx].addr + offset;
948                 pio.width = size;
949                 pio.val = 0;
950
951                 (void)ioctl(iofd, IODEV_PIO, &pio);
952
953                 val = pio.val;
954         }
955
956         return (val);
957 }
958
959 static void
960 passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
961                    int enabled, uint64_t address)
962 {
963         struct passthru_softc *sc;
964         size_t remaining;
965         uint32_t table_size, table_offset;
966
967         sc = pi->pi_arg;
968         table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
969         if (table_offset > 0) {
970                 if (!enabled) {
971                         if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
972                                                  sc->psc_sel.pc_dev,
973                                                  sc->psc_sel.pc_func, address,
974                                                  table_offset) != 0)
975                                 warnx("pci_passthru: unmap_pptdev_mmio failed");
976                 } else {
977                         if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
978                                                sc->psc_sel.pc_dev,
979                                                sc->psc_sel.pc_func, address,
980                                                table_offset,
981                                                sc->psc_bar[baridx].addr) != 0)
982                                 warnx("pci_passthru: map_pptdev_mmio failed");
983                 }
984         }
985         table_size = pi->pi_msix.table_offset - table_offset;
986         table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
987         table_size = roundup2(table_size, 4096);
988         remaining = pi->pi_bar[baridx].size - table_offset - table_size;
989         if (remaining > 0) {
990                 address += table_offset + table_size;
991                 if (!enabled) {
992                         if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
993                                                  sc->psc_sel.pc_dev,
994                                                  sc->psc_sel.pc_func, address,
995                                                  remaining) != 0)
996                                 warnx("pci_passthru: unmap_pptdev_mmio failed");
997                 } else {
998                         if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
999                                                sc->psc_sel.pc_dev,
1000                                                sc->psc_sel.pc_func, address,
1001                                                remaining,
1002                                                sc->psc_bar[baridx].addr +
1003                                                table_offset + table_size) != 0)
1004                                 warnx("pci_passthru: map_pptdev_mmio failed");
1005                 }
1006         }
1007 }
1008
/*
 * Map (when enabled) or unmap (when disabled) an entire passthrough
 * MMIO BAR at the supplied guest-physical address.
 */
static void
passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
		   int enabled, uint64_t address)
{
	struct passthru_softc *sc = pi->pi_arg;
	struct pcisel *sel = &sc->psc_sel;

	if (enabled) {
		if (vm_map_pptdev_mmio(ctx, sel->pc_bus, sel->pc_dev,
		    sel->pc_func, address, sc->psc_bar[baridx].size,
		    sc->psc_bar[baridx].addr) != 0)
			warnx("pci_passthru: map_pptdev_mmio failed");
	} else {
		if (vm_unmap_pptdev_mmio(ctx, sel->pc_bus, sel->pc_dev,
		    sel->pc_func, address, sc->psc_bar[baridx].size) != 0)
			warnx("pci_passthru: unmap_pptdev_mmio failed");
	}
}
1031
1032 static void
1033 passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
1034               int enabled, uint64_t address)
1035 {
1036
1037         if (pi->pi_bar[baridx].type == PCIBAR_IO)
1038                 return;
1039         if (baridx == pci_msix_table_bar(pi))
1040                 passthru_msix_addr(ctx, pi, baridx, enabled, address);
1041         else
1042                 passthru_mmio_addr(ctx, pi, baridx, enabled, address);
1043 }
1044
/*
 * Device-emulation ops table for "passthru" devices: config-space
 * accesses, BAR reads/writes, and BAR address changes are routed to
 * the handlers above.  PCI_EMUL_SET registers the table with the
 * bhyve PCI emulation framework at link time.
 */
struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_legacy_config = passthru_legacy_config,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite	= passthru_write,
	.pe_barread	= passthru_read,
	.pe_baraddr	= passthru_addr,
};
PCI_EMUL_SET(passthru);