/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <dev/pci/pcireg.h>

#include <machine/vmparam.h>
#include <contrib/dev/acpica/include/acpi.h>

#include "io/iommu.h"
/*
 * Documented in the "Intel Virtualization Technology for Directed I/O",
 * Architecture Spec, September 2008.
 */

/* Section 10.4 "Register Descriptions" */
struct vtdmap {
	volatile uint32_t	version;
	volatile uint32_t	res0;
	volatile uint64_t	cap;
	volatile uint64_t	ext_cap;
	volatile uint32_t	gcr;
	volatile uint32_t	gsr;
	volatile uint64_t	rta;
	volatile uint64_t	ccr;
};
#define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)
#define	VTD_CAP_ND(cap)		((cap) & 0x7)
#define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)
#define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)
#define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)

#define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
#define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)
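/*
 * Illustrative note: the IRO field is expressed in units of 16 bytes, so
 * vtd_iotlb_global_invalidate() below locates the 64-bit IOTLB invalidate
 * register at
 *
 *	(caddr_t)vtdmap + VTD_ECAP_IRO(vtdmap->ext_cap) * 16 + 8
 *
 * i.e. 8 bytes past the start of the IOTLB register block.
 */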
#define	VTD_GCR_WBF		(1 << 27)
#define	VTD_GCR_SRTP		(1 << 30)
#define	VTD_GCR_TE		(1U << 31)

#define	VTD_GSR_WBFS		(1 << 27)
#define	VTD_GSR_RTPS		(1 << 30)
#define	VTD_GSR_TES		(1U << 31)
#define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
#define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */

#define	VTD_IIR_IVT		(1UL << 63)	/* invalidate IOTLB */
#define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
#define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
#define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
#define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
#define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
#define	VTD_IIR_DOMAIN_P	32
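/*
 * The XXX notes further down suggest switching to domain-selective IOTLB
 * invalidation.  A sketch of what such a request would presumably look like,
 * mirroring the global invalidation done in vtd_iotlb_global_invalidate():
 *
 *	*iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_DOMAIN |
 *	    VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES |
 *	    ((uint64_t)dom->id << VTD_IIR_DOMAIN_P);
 *
 * followed by polling until VTD_IIR_IVT reads back as clear.
 */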
#define	VTD_ROOT_PRESENT	0x1
#define	VTD_CTX_PRESENT		0x1
#define	VTD_CTX_TT_ALL		(1UL << 2)

#define	VTD_PTE_RD		(1UL << 0)
#define	VTD_PTE_WR		(1UL << 1)
#define	VTD_PTE_SUPERPAGE	(1UL << 7)
#define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)
struct domain {
	uint64_t	*ptp;		/* first level page table page */
	int		pt_levels;	/* number of page table levels */
	int		addrwidth;	/* 'AW' field in context entry */
	int		spsmask;	/* supported super page sizes */
	u_int		id;		/* domain id */
	vm_paddr_t	maxaddr;	/* highest address to be mapped */
	SLIST_ENTRY(domain) next;
};
static SLIST_HEAD(, domain) domhead;

#define	DRHD_MAX_UNITS	8
static int drhd_num;
static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
static int max_domains;
typedef int (*drhd_ident_func_t)(void);

static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
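/*
 * Table layout: the root table holds one 16-byte entry (two uint64_t's) per
 * PCI bus, and each per-bus context table holds one 16-byte entry per
 * device/function ("devfn" = slot << 3 | func).  This is why the code below
 * indexes these arrays with 'i * 2' and '(slot << 3 | func) * 2'.
 */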
static MALLOC_DEFINE(M_VTD, "vtd", "vtd");

vtd_max_domains(struct vtdmap *vtdmap)

	nd = VTD_CAP_ND(vtdmap->cap);

		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);

	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
	for (id = 1; id < max_domains; id++) {
		SLIST_FOREACH(dom, &domhead, next) {
			break;		/* found it */

	if (id >= max_domains)
		panic("domain ids exhausted");
vtd_wbflush(struct vtdmap *vtdmap)

	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
		pmap_invalidate_cache();

	if (VTD_CAP_RWBF(vtdmap->cap)) {
		vtdmap->gcr = VTD_GCR_WBF;
		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)

vtd_ctx_global_invalidate(struct vtdmap *vtdmap)

	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)

vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)

	volatile uint64_t *iotlb_reg, val;

	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);

	*iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
	    VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;

	if ((val & VTD_IIR_IVT) == 0)

vtd_translation_enable(struct vtdmap *vtdmap)

	vtdmap->gcr = VTD_GCR_TE;
	while ((vtdmap->gsr & VTD_GSR_TES) == 0)

vtd_translation_disable(struct vtdmap *vtdmap)

	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
	int i, units, remaining;
	struct vtdmap *vtdmap;
	vm_paddr_t ctx_paddr;
	char *end, envname[32];
	unsigned long mapaddr;
	ACPI_TABLE_DMAR *dmar;
	ACPI_DMAR_HEADER *hdr;
	ACPI_DMAR_HARDWARE_UNIT *drhd;
	/*
	 * Allow the user to override the ACPI DMAR table by specifying the
	 * physical address of each remapping unit.
	 *
	 * The following example specifies two remapping units at
	 * physical addresses 0xfed90000 and 0xfeda0000 respectively.
	 * set vtd.regmap.0.addr=0xfed90000
	 * set vtd.regmap.1.addr=0xfeda0000
	 */
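	/*
	 * These are looked up in the kernel environment, so they can also be
	 * set as loader tunables, e.g. vtd.regmap.0.addr="0xfed90000" in
	 * /boot/loader.conf.
	 */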
	for (units = 0; units < DRHD_MAX_UNITS; units++) {
		snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units);
		if (getenv_ulong(envname, &mapaddr) == 0)
			break;
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
	}

	/* Search for DMAR table. */
	status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
	if (ACPI_FAILURE(status))

	end = (char *)dmar + dmar->Header.Length;
	remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR);
	while (remaining > sizeof(ACPI_DMAR_HEADER)) {
		hdr = (ACPI_DMAR_HEADER *)(end - remaining);
		if (hdr->Length > remaining)
			break;
		/*
		 * From Intel VT-d arch spec, version 1.3:
		 * BIOS implementations must report mapping structures
		 * in numerical order, i.e. all remapping structures of
		 * type 0 (DRHD) are enumerated before remapping structures
		 * of type 1 (RMRR), and so forth.
		 */
		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
			break;

		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
		vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
		if (units >= DRHD_MAX_UNITS)
			break;
		remaining -= hdr->Length;
	}

	if (VTD_CAP_CM(vtdmap->cap) != 0)
		panic("vtd_init: invalid caching mode");

	max_domains = vtd_max_domains(vtdmap);

	/*
	 * Set up the root-table to point to the context-entry tables
	 */
	for (i = 0; i < 256; i++) {
		ctx_paddr = vtophys(ctx_tables[i]);
		if (ctx_paddr & PAGE_MASK)
			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);

		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
	}
	struct vtdmap *vtdmap;

	for (i = 0; i < drhd_num; i++) {

		/* Update the root table address */
		vtdmap->rta = vtophys(root_table);
		vtdmap->gcr = VTD_GCR_SRTP;
		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)

		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);

		vtd_translation_enable(vtdmap);

	struct vtdmap *vtdmap;

	for (i = 0; i < drhd_num; i++) {
		vtd_translation_disable(vtdmap);
vtd_add_device(void *arg, int bus, int slot, int func)

	struct domain *dom = arg;
	struct vtdmap *vtdmap;

	if (bus < 0 || bus > PCI_BUSMAX ||
	    slot < 0 || slot > PCI_SLOTMAX ||
	    func < 0 || func > PCI_FUNCMAX)
		panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
	ctxp = ctx_tables[bus];
	pt_paddr = vtophys(dom->ptp);
	idx = (slot << 3 | func) * 2;
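	/*
	 * Each context entry is 128 bits wide: ctxp[idx] holds the low
	 * 64 bits (page table pointer, translation type and present bit)
	 * and ctxp[idx + 1] holds the high 64 bits (address width and
	 * domain id).
	 */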
	if (ctxp[idx] & VTD_CTX_PRESENT) {
		panic("vtd_add_device: device %d/%d/%d is already owned by "
		    "domain %d", bus, slot, func,
		    (uint16_t)(ctxp[idx + 1] >> 8));
	}
	/*
	 * Order is important. The 'present' bit is set only after all fields
	 * of the context entry are initialized.
	 */
	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);

	if (VTD_ECAP_DI(vtdmap->ext_cap))
		ctxp[idx] = VTD_CTX_TT_ALL;

	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;

	/*
	 * 'Not Present' entries are not cached in either the Context Cache
	 * or in the IOTLB, so there is no need to invalidate either of them.
	 */
vtd_remove_device(void *arg, int bus, int slot, int func)

	struct vtdmap *vtdmap;

	if (bus < 0 || bus > PCI_BUSMAX ||
	    slot < 0 || slot > PCI_SLOTMAX ||
	    func < 0 || func > PCI_FUNCMAX)
		panic("vtd_remove_device: invalid bsf %d/%d/%d", bus, slot, func);
	ctxp = ctx_tables[bus];
	idx = (slot << 3 | func) * 2;

	/*
	 * Order is important. The 'present' bit must be cleared first.
	 */
	ctxp[idx] = 0;
	ctxp[idx + 1] = 0;

	/*
	 * Invalidate the Context Cache and the IOTLB.
	 *
	 * XXX use device-selective invalidation for Context Cache
	 * XXX use domain-selective invalidation for IOTLB
	 */
	for (i = 0; i < drhd_num; i++) {
		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);

#define	CREATE_MAPPING	0
#define	REMOVE_MAPPING	1
vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
    int remove)

	int i, spshift, ptpshift, ptpindex, nlevels;
	uint64_t spsize, *ptp;

	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
	    gpa, len));
	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
	    "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));

	if (gpa & PAGE_MASK)
		panic("vtd_update_mapping: unaligned gpa 0x%0lx", gpa);
	if (hpa & PAGE_MASK)
		panic("vtd_update_mapping: unaligned hpa 0x%0lx", hpa);
	if (len & PAGE_MASK)
		panic("vtd_update_mapping: unaligned len 0x%0lx", len);
	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - supported super page size
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'len'
	 */
	for (i = 3; i >= 0; i--) {
		spsize = 1UL << spshift;
		if ((dom->spsmask & (1 << i)) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
	nlevels = dom->pt_levels;
	while (--nlevels >= 0) {
		ptpshift = 12 + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;
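		/*
		 * For example, with a 3-level page table a gpa of 0x40201000
		 * (1 GiB + 2 MiB + 4 KiB) yields:
		 *	nlevels = 2: ptpshift = 30, ptpindex = 1
		 *	nlevels = 1: ptpshift = 21, ptpindex = 1
		 *	nlevels = 0: ptpshift = 12, ptpindex = 1
		 * i.e. each level consumes 9 bits of the gpa above the
		 * 12-bit page offset.
		 */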
		/* We have reached the leaf mapping */
		if (spshift >= ptpshift) {
			break;
		}

		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create a downstream page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
			ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
		}

		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
	}
	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);

	/*
	 * Update the 'gpa' -> 'hpa' mapping
	 */
	ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;

	ptp[ptpindex] |= VTD_PTE_SUPERPAGE;

	return (1UL << ptpshift);
vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)

	return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));

vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)

	return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
vtd_invalidate_tlb(void *dom)

	struct vtdmap *vtdmap;

	/*
	 * Invalidate the IOTLB.
	 * XXX use domain-selective invalidation for IOTLB
	 */
	for (i = 0; i < drhd_num; i++) {
		vtd_iotlb_global_invalidate(vtdmap);
vtd_create_domain(vm_paddr_t maxaddr)

	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
	struct vtdmap *vtdmap;

		panic("vtd_create_domain: no dma remapping hardware available");
	/*
	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
	 */
	for (gaw = 0; addr < maxaddr; gaw++)
		addr = 1UL << gaw;

	res = (gaw - 12) % 9;
	agaw = gaw + 9 - res;
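	/*
	 * Worked example: if the loop above ends with gaw = 36, then
	 * res = (36 - 12) % 9 = 6 and agaw = 36 + 9 - 6 = 39, i.e. the guest
	 * address width is rounded up to the next width the hardware can
	 * express (30, 39, 48 or 57 bits, corresponding to 2, 3, 4 or 5
	 * page table levels of 9 bits each above the 4KB page offset).
	 */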
	/*
	 * Select the smallest Supported AGAW and the corresponding number
	 * of page table levels.
	 */
	tmp = VTD_CAP_SAGAW(vtdmap->cap);
	for (i = 0; i < 5; i++) {
		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
			break;

		panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
		    VTD_CAP_SAGAW(vtdmap->cap), agaw);
	dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
	dom->pt_levels = pt_levels;
	dom->addrwidth = addrwidth;
	dom->id = domain_id();
	dom->maxaddr = maxaddr;
	dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
	if ((uintptr_t)dom->ptp & PAGE_MASK)
		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
#ifdef notyet
	/*
	 * XXX superpage mappings for the iommu do not work correctly.
	 *
	 * By default all physical memory is mapped into the host_domain.
	 * When a VM is allocated wired memory the pages belonging to it
	 * are removed from the host_domain and added to the vm's domain.
	 *
	 * If the page being removed was mapped using a superpage mapping
	 * in the host_domain then we need to demote the mapping before
	 * removing the page.
	 *
	 * There is currently no code to deal with the demotion, so we
	 * disable superpage mappings altogether.
	 */
	dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
#endif

	SLIST_INSERT_HEAD(&domhead, dom, next);
vtd_free_ptp(uint64_t *ptp, int level)

	for (i = 0; i < 512; i++) {
		if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
			continue;
		if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
			continue;
		nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
		vtd_free_ptp(nlp, level - 1);
	}

	bzero(ptp, PAGE_SIZE);
vtd_destroy_domain(void *arg)

	SLIST_REMOVE(&domhead, dom, domain, next);
	vtd_free_ptp(dom->ptp, dom->pt_levels);

struct iommu_ops iommu_ops_intel = {