/*-
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/time.h>
#include <sys/tree.h>
#include <sys/vmem.h>
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <x86/include/apicvar.h>
#include <x86/include/busdma_impl.h>
#include <x86/iommu/intel_reg.h>
#include <x86/iommu/busdma_dmar.h>
#include <x86/iommu/intel_dmar.h>

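/*
 * CAP.ND encodes the number of domain ids supported by the unit as
 * 2^(4 + 2 * nd).  dmar_nd2mask() below returns the corresponding
 * domain-id mask: e.g. nd == 0 means 16 domains (mask 0x000f) and
 * nd == 6 means 65536 domains (mask 0xffff); nd == 7 is reserved.
 */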
u_int
dmar_nd2mask(u_int nd)
{
	static const u_int masks[] = {
		0x000f,	/* nd == 0, 16 domains */
		0x003f,	/* nd == 1, 64 domains */
		0x00ff,	/* nd == 2, 256 domains */
		0x03ff,	/* nd == 3, 1024 domains */
		0x0fff,	/* nd == 4, 4096 domains */
		0x3fff,	/* nd == 5, 16384 domains */
		0xffff,	/* nd == 6, 65536 domains */
		0x0000,	/* nd == 7 reserved */
	};

	KASSERT(nd <= 6, ("number of domains %d", nd));
	return (masks[nd]);
}

static const struct sagaw_bits_tag {
	int agaw;
	int cap;
	int awlvl;
	int pglvl;
} sagaw_bits[] = {
	{.agaw = 30, .cap = DMAR_CAP_SAGAW_2LVL, .awlvl = DMAR_CTX2_AW_2LVL,
	    .pglvl = 2},
	{.agaw = 39, .cap = DMAR_CAP_SAGAW_3LVL, .awlvl = DMAR_CTX2_AW_3LVL,
	    .pglvl = 3},
	{.agaw = 48, .cap = DMAR_CAP_SAGAW_4LVL, .awlvl = DMAR_CTX2_AW_4LVL,
	    .pglvl = 4},
	{.agaw = 57, .cap = DMAR_CAP_SAGAW_5LVL, .awlvl = DMAR_CTX2_AW_5LVL,
	    .pglvl = 5},
	{.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL,
	    .pglvl = 6}
};

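/*
 * A page table with pglvl levels resolves 9 * pglvl + 12 bits of
 * guest address, capped at the architectural maximum of 64, which
 * gives the AGAW progression in the table above: 2 levels -> 30,
 * 3 -> 39, 4 -> 48, 5 -> 57, 6 -> 64.
 */
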
bool
dmar_pglvl_supported(struct dmar_unit *unit, int pglvl)
{
	int i;

	for (i = 0; i < nitems(sagaw_bits); i++) {
		if (sagaw_bits[i].pglvl != pglvl)
			continue;
		if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
			return (true);
	}
	return (false);
}

int
domain_set_agaw(struct dmar_domain *domain, int mgaw)
{
	int sagaw, i;

	domain->mgaw = mgaw;
	sagaw = DMAR_CAP_SAGAW(domain->dmar->hw_cap);
	for (i = 0; i < nitems(sagaw_bits); i++) {
		if (sagaw_bits[i].agaw >= mgaw) {
			domain->agaw = sagaw_bits[i].agaw;
			domain->pglvl = sagaw_bits[i].pglvl;
			domain->awlvl = sagaw_bits[i].awlvl;
			return (0);
		}
	}
	device_printf(domain->dmar->dev,
	    "context request mgaw %d: no agaw found, sagaw %x\n",
	    mgaw, sagaw);
	return (EINVAL);
}

/*
 * Find a best fit mgaw for the given maxaddr:
 *   - if allow_less is false, must find sagaw which maps all requested
 *     addresses (used by identity mappings);
 *   - if allow_less is true, and no supported sagaw can map all requested
 *     address space, accept the biggest sagaw, whatever it is.
 */
int
dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr, bool allow_less)
{
	int i;

	for (i = 0; i < nitems(sagaw_bits); i++) {
		if ((1ULL << sagaw_bits[i].agaw) >= maxaddr &&
		    (DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
			break;
	}
	if (allow_less && i == nitems(sagaw_bits)) {
		do {
			i--;
		} while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap)
		    == 0);
	}
	if (i < nitems(sagaw_bits))
		return (sagaw_bits[i].agaw);
	KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d",
	    (uintmax_t) maxaddr, allow_less));
	return (-1);
}

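/*
 * Worked example: for a device limited to 32-bit addresses,
 * maxaddr = 1ULL << 32, the first entry with 1 << agaw >= maxaddr
 * that the unit supports is selected, typically the 3-level,
 * 39-bit one.
 */
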
/*
 * Calculate the total amount of page table pages needed to map the
 * whole bus address space on the context with the selected agaw.
 */
vm_pindex_t
pglvl_max_pages(int pglvl)
{
	vm_pindex_t res;
	int i;

	for (res = 0, i = pglvl; i > 0; i--) {
		res *= DMAR_NPTEPG;
		res++;
	}
	return (res);
}

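/*
 * The loop above evaluates to 1 + n + n^2 + ... + n^(pglvl - 1) with
 * n = DMAR_NPTEPG = 512: e.g. pglvl == 3 gives at most
 * 1 + 512 + 262144 = 262657 page table pages.
 */
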
/*
 * Return true if the page table level lvl supports the superpage for
 * the domain.
 */
int
domain_is_sp_lvl(struct dmar_domain *domain, int lvl)
{
	int alvl, cap_sps;
	static const int sagaw_sp[] = {
		DMAR_CAP_SPS_2M,
		DMAR_CAP_SPS_1G,
		DMAR_CAP_SPS_512G,
		DMAR_CAP_SPS_1T
	};

	alvl = domain->pglvl - lvl - 1;
	cap_sps = DMAR_CAP_SPS(domain->dmar->hw_cap);
	return (alvl < nitems(sagaw_sp) && (sagaw_sp[alvl] & cap_sps) != 0);
}

dmar_gaddr_t
pglvl_page_size(int total_pglvl, int lvl)
{
	int rlvl;
	static const dmar_gaddr_t pg_sz[] = {
		(dmar_gaddr_t)DMAR_PAGE_SIZE,
		(dmar_gaddr_t)DMAR_PAGE_SIZE << DMAR_NPTEPGSHIFT,
		(dmar_gaddr_t)DMAR_PAGE_SIZE << (2 * DMAR_NPTEPGSHIFT),
		(dmar_gaddr_t)DMAR_PAGE_SIZE << (3 * DMAR_NPTEPGSHIFT),
		(dmar_gaddr_t)DMAR_PAGE_SIZE << (4 * DMAR_NPTEPGSHIFT),
		(dmar_gaddr_t)DMAR_PAGE_SIZE << (5 * DMAR_NPTEPGSHIFT)
	};

	KASSERT(lvl >= 0 && lvl < total_pglvl,
	    ("total %d lvl %d", total_pglvl, lvl));
	rlvl = total_pglvl - lvl - 1;
	KASSERT(rlvl < nitems(pg_sz), ("sizeof pg_sz lvl %d", lvl));
	return (pg_sz[rlvl]);
}

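/*
 * The size of a mapping grows by a factor of DMAR_NPTEPG (512) per
 * level above the leaf: with a 4-level table, lvl 3 maps 4K pages,
 * lvl 2 maps 2M, lvl 1 maps 1G and lvl 0 maps 512G.
 */
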
dmar_gaddr_t
domain_page_size(struct dmar_domain *domain, int lvl)
{

	return (pglvl_page_size(domain->pglvl, lvl));
}

int
calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size,
    dmar_gaddr_t *isizep)
{
	dmar_gaddr_t isize;
	int am;

	for (am = DMAR_CAP_MAMV(unit->hw_cap);; am--) {
		isize = 1ULL << (am + DMAR_PAGE_SHIFT);
		if ((base & (isize - 1)) == 0 && size >= isize)
			break;
		if (am == 0)
			break;
	}
	*isizep = isize;
	return (am);
}

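/*
 * calc_am() selects the address mask for a page-selective IOTLB
 * invalidation: the largest am not exceeding CAP.MAMV for which the
 * 2^(am + 12) byte region is naturally aligned at base and does not
 * exceed size.  E.g. base = 0x200000, size = 0x200000 yields am = 9
 * (a 2M region), provided the hardware reports MAMV >= 9.
 */
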
dmar_haddr_t dmar_high;
int dmar_tbl_pagecnt;

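/*
 * The DMAR_PGF_* flags, as consumed below: DMAR_PGF_OBJL means the
 * caller already holds the object lock, DMAR_PGF_NOALLOC turns the
 * call into a pure lookup, DMAR_PGF_ZERO returns a zeroed page, and
 * DMAR_PGF_WAITOK allows sleeping for free memory and retrying
 * instead of failing.
 */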
vm_page_t
dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags)
{
	vm_page_t m;
	int zeroed;

	zeroed = (flags & DMAR_PGF_ZERO) != 0 ? VM_ALLOC_ZERO : 0;
	for (;;) {
		if ((flags & DMAR_PGF_OBJL) == 0)
			VM_OBJECT_WLOCK(obj);
		m = vm_page_lookup(obj, idx);
		if ((flags & DMAR_PGF_NOALLOC) != 0 || m != NULL) {
			if ((flags & DMAR_PGF_OBJL) == 0)
				VM_OBJECT_WUNLOCK(obj);
			break;
		}
		m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY |
		    VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0,
		    dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
		if ((flags & DMAR_PGF_OBJL) == 0)
			VM_OBJECT_WUNLOCK(obj);
		if (m != NULL) {
			if (zeroed && (m->flags & PG_ZERO) == 0)
				pmap_zero_page(m);
			atomic_add_int(&dmar_tbl_pagecnt, 1);
			break;
		}
		if ((flags & DMAR_PGF_WAITOK) == 0)
			break;
		if ((flags & DMAR_PGF_OBJL) != 0)
			VM_OBJECT_WUNLOCK(obj);
		VM_WAIT;
		if ((flags & DMAR_PGF_OBJL) != 0)
			VM_OBJECT_WLOCK(obj);
	}
	return (m);
}

void
dmar_pgfree(vm_object_t obj, vm_pindex_t idx, int flags)
{
	vm_page_t m;

	if ((flags & DMAR_PGF_OBJL) == 0)
		VM_OBJECT_WLOCK(obj);
	m = vm_page_lookup(obj, idx);
	if (m != NULL) {
		vm_page_free(m);
		atomic_subtract_int(&dmar_tbl_pagecnt, 1);
	}
	if ((flags & DMAR_PGF_OBJL) == 0)
		VM_OBJECT_WUNLOCK(obj);
}

void *
dmar_map_pgtbl(vm_object_t obj, vm_pindex_t idx, int flags,
    struct sf_buf **sf)
{
	vm_page_t m;
	bool allocated;

	if ((flags & DMAR_PGF_OBJL) == 0)
		VM_OBJECT_WLOCK(obj);
	m = vm_page_lookup(obj, idx);
	if (m == NULL && (flags & DMAR_PGF_ALLOC) != 0) {
		m = dmar_pgalloc(obj, idx, flags | DMAR_PGF_OBJL);
		allocated = true;
	} else
		allocated = false;
	if (m == NULL) {
		if ((flags & DMAR_PGF_OBJL) == 0)
			VM_OBJECT_WUNLOCK(obj);
		return (NULL);
	}
	/* Sleepable allocations cannot fail. */
	if ((flags & DMAR_PGF_WAITOK) != 0)
		VM_OBJECT_WUNLOCK(obj);
	sched_pin();
	*sf = sf_buf_alloc(m, SFB_CPUPRIVATE | ((flags & DMAR_PGF_WAITOK)
	    == 0 ? SFB_NOWAIT : 0));
	if (*sf == NULL) {
		sched_unpin();
		if (allocated) {
			VM_OBJECT_ASSERT_WLOCKED(obj);
			dmar_pgfree(obj, m->pindex, flags | DMAR_PGF_OBJL);
		}
		if ((flags & DMAR_PGF_OBJL) == 0)
			VM_OBJECT_WUNLOCK(obj);
		return (NULL);
	}
	if ((flags & (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) ==
	    (DMAR_PGF_WAITOK | DMAR_PGF_OBJL))
		VM_OBJECT_WLOCK(obj);
	else if ((flags & (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) == 0)
		VM_OBJECT_WUNLOCK(obj);
	return ((void *)sf_buf_kva(*sf));
}

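/*
 * A usage sketch for the mapping helpers (pgtbl_obj here stands for
 * whatever VM object backs the page tables): map a page table page,
 * update an entry, push it to RAM for the hardware, then unmap.
 *
 *	struct sf_buf *sf;
 *	dmar_pte_t *ptep;
 *
 *	ptep = dmar_map_pgtbl(pgtbl_obj, idx,
 *	    DMAR_PGF_ALLOC | DMAR_PGF_WAITOK, &sf);
 *	ptep->pte = ...;
 *	dmar_flush_pte_to_ram(unit, ptep);
 *	dmar_unmap_pgtbl(sf);
 */
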
void
dmar_unmap_pgtbl(struct sf_buf *sf)
{

	sf_buf_free(sf);
	sched_unpin();
}

static void
dmar_flush_transl_to_ram(struct dmar_unit *unit, void *dst, size_t sz)
{

	if (DMAR_IS_COHERENT(unit))
		return;
	/*
	 * If DMAR does not snoop paging structures accesses, flush
	 * CPU cache to memory.
	 */
	pmap_invalidate_cache_range((uintptr_t)dst, (uintptr_t)dst + sz,
	    TRUE);
}

void
dmar_flush_pte_to_ram(struct dmar_unit *unit, dmar_pte_t *dst)
{

	dmar_flush_transl_to_ram(unit, dst, sizeof(*dst));
}

void
dmar_flush_ctx_to_ram(struct dmar_unit *unit, dmar_ctx_entry_t *dst)
{

	dmar_flush_transl_to_ram(unit, dst, sizeof(*dst));
}

void
dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst)
{

	dmar_flush_transl_to_ram(unit, dst, sizeof(*dst));
}

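/*
 * The register update helpers below rely on the DMAR_WAIT_UNTIL()
 * macro from intel_dmar.h, which busy-waits for the given condition,
 * bounds the wait by the hw.dmar.timeout sysctl, and sets the local
 * "error" variable to 0 on success or ETIMEDOUT.
 */
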
/*
 * Load the root entry pointer into the hardware, busily waiting for
 * the completion.
 */
int
dmar_load_root_entry_ptr(struct dmar_unit *unit)
{
	vm_page_t root_entry;
	int error;

	/*
	 * Access to the GCMD register must be serialized while the
	 * command is submitted.
	 */
	DMAR_ASSERT_LOCKED(unit);

	VM_OBJECT_RLOCK(unit->ctx_obj);
	root_entry = vm_page_lookup(unit->ctx_obj, 0);
	VM_OBJECT_RUNLOCK(unit->ctx_obj);
	dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry));
	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS)
	    != 0));
	return (error);
}

/*
 * Globally invalidate the context entries cache, busily waiting for
 * the completion.
 */
int
dmar_inv_ctx_glob(struct dmar_unit *unit)
{
	int error;

	/*
	 * Access to the CCMD register must be serialized while the
	 * command is submitted.
	 */
	DMAR_ASSERT_LOCKED(unit);
	KASSERT(!unit->qi_enabled, ("QI enabled"));

	/*
	 * The DMAR_CCMD_ICC bit in the upper dword should be written
	 * after the low dword write is completed.  The amd64
	 * dmar_write8() does not have this issue; the i386
	 * dmar_write8() writes the upper dword last.
	 */
	dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32)
	    == 0));
	return (error);
}

/*
 * Globally invalidate the IOTLB, busily waiting for the completion.
 */
int
dmar_inv_iotlb_glob(struct dmar_unit *unit)
{
	int error, reg;

	DMAR_ASSERT_LOCKED(unit);
	KASSERT(!unit->qi_enabled, ("QI enabled"));

	reg = 16 * DMAR_ECAP_IRO(unit->hw_ecap);
	/* See the comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob(). */
	dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT |
	    DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
	    DMAR_IOTLB_IVT32) == 0));
	return (error);
}

/*
 * Flush the chipset write buffers.  See 11.1 "Write Buffer Flushing"
 * in the architecture specification.
 */
int
dmar_flush_write_bufs(struct dmar_unit *unit)
{
	int error;

	DMAR_ASSERT_LOCKED(unit);

	/*
	 * DMAR_GCMD_WBF is only valid when CAP_RWBF is reported.
	 */
	KASSERT((unit->hw_cap & DMAR_CAP_RWBF) != 0,
	    ("dmar%d: no RWBF", unit->unit));

	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS)
	    != 0));
	return (error);
}

int
dmar_enable_translation(struct dmar_unit *unit)
{
	int error;

	DMAR_ASSERT_LOCKED(unit);
	unit->hw_gcmd |= DMAR_GCMD_TE;
	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
	    != 0));
	return (error);
}

int
dmar_disable_translation(struct dmar_unit *unit)
{
	int error;

	DMAR_ASSERT_LOCKED(unit);
	unit->hw_gcmd &= ~DMAR_GCMD_TE;
	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
	    == 0));
	return (error);
}

int
dmar_load_irt_ptr(struct dmar_unit *unit)
{
	uint64_t irta;
	int error, s;

	DMAR_ASSERT_LOCKED(unit);
	irta = unit->irt_phys;
	if (DMAR_X2APIC(unit))
		irta |= DMAR_IRTA_EIME;
	s = fls(unit->irte_cnt) - 2;
	KASSERT(unit->irte_cnt >= 2 && s <= DMAR_IRTA_S_MASK &&
	    powerof2(unit->irte_cnt),
	    ("IRTA_REG_S overflow %x", unit->irte_cnt));
	irta |= s;
	dmar_write8(unit, DMAR_IRTA_REG, irta);
	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SIRTP);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRTPS)
	    != 0));
	return (error);
}

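/*
 * IRTA.S encodes the interrupt remapping table size as 2^(S + 1)
 * entries, hence s = fls(irte_cnt) - 2 above: e.g. irte_cnt == 1024
 * gives fls(1024) == 11, s == 9, and 2^(9 + 1) == 1024.
 */
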
int
dmar_enable_ir(struct dmar_unit *unit)
{
	int error;

	DMAR_ASSERT_LOCKED(unit);
	unit->hw_gcmd |= DMAR_GCMD_IRE;
	unit->hw_gcmd &= ~DMAR_GCMD_CFI;
	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
	    != 0));
	return (error);
}

int
dmar_disable_ir(struct dmar_unit *unit)
{
	int error;

	DMAR_ASSERT_LOCKED(unit);
	unit->hw_gcmd &= ~DMAR_GCMD_IRE;
	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
	    == 0));
	return (error);
}

#define	BARRIER_F				\
	u_int f_done, f_inproc, f_wakeup;	\
						\
	f_done = 1 << (barrier_id * 3);		\
	f_inproc = 1 << (barrier_id * 3 + 1);	\
	f_wakeup = 1 << (barrier_id * 3 + 2)

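/*
 * Each barrier_id owns three adjacent bits in dmar->barrier_flags:
 * "done" (the one-time action has completed), "inproc" (some thread
 * is performing it now) and "wakeup" (waiters requested a wakeup on
 * completion).
 */
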
bool
dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id)
{
	BARRIER_F;

	DMAR_LOCK(dmar);
	if ((dmar->barrier_flags & f_done) != 0) {
		DMAR_UNLOCK(dmar);
		return (false);
	}

	if ((dmar->barrier_flags & f_inproc) != 0) {
		while ((dmar->barrier_flags & f_inproc) != 0) {
			dmar->barrier_flags |= f_wakeup;
			msleep(&dmar->barrier_flags, &dmar->lock, 0,
			    "dmarb", 0);
		}
		KASSERT((dmar->barrier_flags & f_done) != 0,
		    ("dmar%d barrier %d missing done", dmar->unit, barrier_id));
		DMAR_UNLOCK(dmar);
		return (false);
	}

	dmar->barrier_flags |= f_inproc;
	DMAR_UNLOCK(dmar);
	return (true);
}

void
dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id)
{
	BARRIER_F;

	DMAR_ASSERT_LOCKED(dmar);
	KASSERT((dmar->barrier_flags & (f_done | f_inproc)) == f_inproc,
	    ("dmar%d barrier %d missed entry", dmar->unit, barrier_id));
	dmar->barrier_flags |= f_done;
	if ((dmar->barrier_flags & f_wakeup) != 0)
		wakeup(&dmar->barrier_flags);
	dmar->barrier_flags &= ~(f_inproc | f_wakeup);
	DMAR_UNLOCK(dmar);
}

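/*
 * Intended usage, a sketch: the winner of the race performs the
 * one-time action, everybody else waits until it completes.  Note
 * that dmar_barrier_exit() expects the DMAR lock to be held on entry
 * and drops it.
 *
 *	if (dmar_barrier_enter(dmar, barrier_id)) {
 *		... perform the one-time action ...
 *		DMAR_LOCK(dmar);
 *		dmar_barrier_exit(dmar, barrier_id);
 *	}
 */
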
int dmar_match_verbose;
int dmar_batch_coalesce = 100;
struct timespec dmar_hw_timeout = {
	.tv_sec = 0,
	.tv_nsec = 1000000	/* 1 ms */
};

static const uint64_t d = 1000000000;

void
dmar_update_timeout(uint64_t newval)
{

	/* XXXKIB not atomic */
	dmar_hw_timeout.tv_sec = newval / d;
	dmar_hw_timeout.tv_nsec = newval % d;
}

uint64_t
dmar_get_timeout(void)
{

	return ((uint64_t)dmar_hw_timeout.tv_sec * d +
	    dmar_hw_timeout.tv_nsec);
}

static int
dmar_timeout_sysctl(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int error;

	val = dmar_get_timeout();
	error = sysctl_handle_64(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	dmar_update_timeout(val);
	return (error);
}

static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, "");
SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
    &dmar_tbl_pagecnt, 0,
    "Count of pages used for DMAR pagetables");
SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN,
    &dmar_match_verbose, 0,
    "Verbose matching of the PCI devices to DMAR paths");
SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RWTUN,
    &dmar_batch_coalesce, 0,
    "Number of queued invalidation batches between interrupts");
SYSCTL_PROC(_hw_dmar, OID_AUTO, timeout,
    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    dmar_timeout_sysctl, "QU",
    "Timeout for command wait, in nanoseconds");
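/*
 * Example: "sysctl hw.dmar.timeout=5000000" sets the command wait
 * timeout to 5 ms.
 */
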
#ifdef INVARIANTS
int dmar_check_free;
SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN,
    &dmar_check_free, 0,
    "Check the GPA RBtree for free_down and free_after validity");
#endif