/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright 2019 Justin Hibbits
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_platform.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/bus.h>
#include <machine/intr_machdep.h>
#include <machine/md_var.h>

#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>

#include <powerpc/powernv/opal.h>
#define	XIVE_PRIORITY	7	/* Random non-zero number */
#define	MAX_XIVE_IRQS	(1<<24)	/* 24-bit XIRR field */
#define	XIVE_TM_QW1_OS		0x010	/* Guest OS registers */
#define	XIVE_TM_QW2_HV_POOL	0x020	/* Hypervisor pool registers */
#define	XIVE_TM_QW3_HV		0x030	/* Hypervisor registers */
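/*
 * Byte offsets of the per-thread management registers within a TM queue
 * word. The names follow the XIVE architecture: NSR is the Notification
 * Source Register, CPPR the Current Processor Priority Register (an
 * interrupt is presented only if its priority is more favored, i.e.
 * numerically lower, than the CPPR), and IPB the Interrupt Pending Buffer.
 */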
#define	XIVE_TM_NSR	0x00
#define	XIVE_TM_CPPR	0x01
#define	XIVE_TM_IPB	0x02
#define	XIVE_TM_LSMFB	0x03
#define	XIVE_TM_ACK_CNT	0x04
#define	XIVE_TM_INC	0x05
#define	XIVE_TM_AGE	0x06
#define	XIVE_TM_PIPR	0x07
/* Byte offsets of words 0 and 2 within a 16-byte TM queue word. */
#define	TM_WORD0	0x0
#define	TM_WORD2	0xc
#define	TM_QW2W2_VP	0x80000000
#define	XIVE_TM_SPC_ACK			0x800
#define	  TM_QW3NSR_HE_SHIFT		14
#define	  TM_QW3_NSR_HE_NONE		0
#define	  TM_QW3_NSR_HE_POOL		1
#define	  TM_QW3_NSR_HE_PHYS		2
#define	  TM_QW3_NSR_HE_LSI		3
#define	XIVE_TM_SPC_PULL_POOL_CTX	0x828
#define	XIVE_IRQ_LOAD_EOI	0x000
#define	XIVE_IRQ_STORE_EOI	0x400
#define	XIVE_IRQ_PQ_00		0xc00
#define	XIVE_IRQ_PQ_01		0xd00

#define	XIVE_IRQ_VAL_P		0x02
#define	XIVE_IRQ_VAL_Q		0x01
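/*
 * Each interrupt source has a 2-bit "PQ" state in its Event State Buffer:
 * P means an event has been presented to a queue, Q means a further event
 * arrived while P was set (coalesced). A load from the EOI page at one of
 * the XIVE_IRQ_PQ_* offsets atomically returns the old PQ pair and sets it
 * to the value encoded in the offset (0xc00 -> PQ=00, 0xd00 -> PQ=01).
 * This is how xive_eoi(), xive_mask(), and xive_unmask() below manipulate
 * source state with loads alone.
 */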
extern void (*powernv_smp_ap_extra_init)(void);
/* Private support */
static void	xive_setup_cpu(void);
static void	xive_smp_cpu_startup(void);
static void	xive_init_irq(struct xive_irq *irqd, u_int irq);
static struct xive_irq	*xive_configure_irq(u_int irq);
static int	xive_provision_page(struct xive_softc *sc);

static int	xive_probe(device_t);
static int	xive_attach(device_t);
static int	xics_probe(device_t);
static int	xics_attach(device_t);

static void	xive_bind(device_t, u_int, cpuset_t, void **);
static void	xive_dispatch(device_t, struct trapframe *);
static void	xive_enable(device_t, u_int, u_int, void **);
static void	xive_eoi(device_t, u_int, void *);
static void	xive_ipi(device_t, u_int);
static void	xive_mask(device_t, u_int, void *);
static void	xive_unmask(device_t, u_int, void *);
static void	xive_translate_code(device_t dev, u_int irq, int code,
		    enum intr_trigger *trig, enum intr_polarity *pol);
static device_method_t xive_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		xive_probe),
	DEVMETHOD(device_attach,	xive_attach),

	/* PIC interface */
	DEVMETHOD(pic_bind,		xive_bind),
	DEVMETHOD(pic_dispatch,		xive_dispatch),
	DEVMETHOD(pic_enable,		xive_enable),
	DEVMETHOD(pic_eoi,		xive_eoi),
	DEVMETHOD(pic_ipi,		xive_ipi),
	DEVMETHOD(pic_mask,		xive_mask),
	DEVMETHOD(pic_unmask,		xive_unmask),
	DEVMETHOD(pic_translate_code,	xive_translate_code),

	DEVMETHOD_END
};
static device_method_t xics_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		xics_probe),
	DEVMETHOD(device_attach,	xics_attach),

	DEVMETHOD_END
};
struct xive_softc {
	device_t	dev;
	struct resource	*sc_mem;
	vm_size_t	sc_prov_page_size;
	uint32_t	sc_offset;
	struct mtx	sc_mtx;
};

struct xive_queue {
	uint32_t	*q_page;
	uint32_t	*q_eoi_page;
	uint32_t	q_toggle;
	uint32_t	q_size;
	uint32_t	q_index;
	uint32_t	q_mask;
};

struct xive_irq {
	uint32_t	girq;
	uint32_t	lirq;
	uint64_t	vp;
	uint64_t	flags;
#define	OPAL_XIVE_IRQ_SHIFT_BUG		0x00000008
#define	OPAL_XIVE_IRQ_LSI		0x00000004
#define	OPAL_XIVE_IRQ_STORE_EOI		0x00000002
#define	OPAL_XIVE_IRQ_TRIGGER_PAGE	0x00000001
	uint64_t	prio;
	vm_offset_t	eoi_page;
	vm_offset_t	trig_page;
	vm_size_t	esb_size;
	uint32_t	chip;
};

struct xive_cpu {
	uint64_t	vp;
	uint64_t	cam;
	uint64_t	chip;
	struct xive_irq	ipi_data;
	struct xive_queue	queue; /* We only use a single queue for now. */
};
static driver_t xive_driver = {
	"xive",
	xive_methods,
	sizeof(struct xive_softc)
};

static driver_t xics_driver = {
	"xivevc",
	xics_methods,
	0
};

static devclass_t xive_devclass;
static devclass_t xics_devclass;

EARLY_DRIVER_MODULE(xive, ofwbus, xive_driver, xive_devclass, 0, 0,
    BUS_PASS_INTERRUPT-1);
EARLY_DRIVER_MODULE(xivevc, ofwbus, xics_driver, xics_devclass, 0, 0,
    BUS_PASS_INTERRUPT);

MALLOC_DEFINE(M_XIVE, "xive", "XIVE Memory");
DPCPU_DEFINE_STATIC(struct xive_cpu, xive_cpu_data);

static int xive_ipi_vector = -1;
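/*
 * The single IPI vector is recorded at xive_enable() time. Internally the
 * IPI is identified by the out-of-range logical IRQ number MAX_XIVE_IRQS,
 * which the dispatch and EOI paths below translate back to this vector.
 */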
/*
 * XIVE Exploitation mode driver.
 *
 * The XIVE, present in the POWER9 CPU, can run in two modes: XICS emulation
 * mode, and "Exploitation mode". XICS emulation mode is compatible with the
 * POWER8 and earlier XICS interrupt controller, using OPAL calls to emulate
 * hypervisor calls and memory accesses. Exploitation mode gives us raw access
 * to the XIVE MMIO, improving performance significantly.
 *
 * The XIVE controller is a very bizarre interrupt controller. It uses queues
 * in memory to pass interrupts around, and maps itself into 512GB of physical
 * device address space, giving each interrupt in the system one or more pages
 * of address space. An IRQ is tied to a virtual processor, which could be a
 * physical CPU thread, or a guest CPU thread (LPAR running on a physical
 * thread). Thus, the controller can route interrupts directly to guest OSes,
 * bypassing processing by the hypervisor, thereby improving performance of
 * the guest OS.
 *
 * An IRQ, in addition to being tied to a virtual processor, has one or two
 * page mappings: an EOI page, and an optional trigger page. The trigger page
 * could be the same as the EOI page. Level-sensitive interrupts (LSIs) don't
 * have a trigger page, as they're external interrupts controlled by physical
 * lines. MSIs and IPIs have trigger pages. An IPI is really just another IRQ
 * in the XIVE, which is triggered by software.
 *
 * An interesting behavior of the XIVE controller is that oftentimes the
 * contents of an address location don't actually matter, but the direction of
 * the action is the signifier (read vs write), and the address is significant.
 * Hence, masking and unmasking an interrupt is done by reading different
 * addresses in the EOI page, and triggering an interrupt consists of writing
 * to the trigger page.
 *
 * Additionally, the MMIO region mapped is CPU-sensitive, just like the
 * per-processor register space (private access) in OpenPIC. In order for a
 * CPU to receive interrupts it must itself configure its CPPR (Current
 * Processor Priority Register); it cannot be set by any other processor.
 * This necessitates the xive_smp_cpu_startup() function.
 *
 * Queues are pages of memory, sized powers-of-two, that are shared with the
 * XIVE. The XIVE writes into the queue with an alternating polarity bit,
 * which flips when the queue wraps.
 */
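/*
 * Illustrative example (not part of the driver): with a 4KB queue of 32-bit
 * entries, the XIVE writes entries 0..1023 with the polarity bit (bit 31)
 * set, wraps, and writes the next 1024 entries with the bit clear. A
 * consumer that tracks the expected polarity (q_toggle in xive_read_eq())
 * can therefore distinguish fresh entries from stale ones without locks.
 */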
/*
 * Offset-based read/write interfaces.
 */
static uint16_t
xive_read_2(struct xive_softc *sc, bus_size_t offset)
{

	return (bus_read_2(sc->sc_mem, sc->sc_offset + offset));
}

static void
xive_write_1(struct xive_softc *sc, bus_size_t offset, uint8_t val)
{

	bus_write_1(sc->sc_mem, sc->sc_offset + offset, val);
}
/* EOI and Trigger page access interfaces. */
static uint64_t
xive_read_mmap8(vm_offset_t addr)
{

	return (*(volatile uint64_t *)addr);
}

static void
xive_write_mmap8(vm_offset_t addr, uint64_t val)
{

	/* Must be volatile: this is a side-effecting MMIO store. */
	*(volatile uint64_t *)(addr) = val;
}
/* Device interfaces. */
static int
xive_probe(device_t dev)
{

	if (!ofw_bus_is_compatible(dev, "ibm,opal-xive-pe"))
		return (ENXIO);

	device_set_desc(dev, "External Interrupt Virtualization Engine");

	/* Make sure we always win against the xicp driver. */
	return (BUS_PROBE_DEFAULT);
}

static int
xics_probe(device_t dev)
{

	if (!ofw_bus_is_compatible(dev, "ibm,opal-xive-vc"))
		return (ENXIO);

	device_set_desc(dev, "External Interrupt Virtualization Engine Root");
	return (BUS_PROBE_DEFAULT);
}
static int
xive_attach(device_t dev)
{
	struct xive_softc *sc = device_get_softc(dev);
	struct xive_cpu *xive_cpud;
	phandle_t phandle = ofw_bus_get_node(dev);
	int64_t vp_block;
	int64_t ipi_irq;
	uint64_t vp_id;
	int error;
	int rid;
	int i, order;

	opal_call(OPAL_XIVE_RESET, OPAL_XIVE_XICS_MODE_EXP);

	error = OF_getencprop(phandle, "ibm,xive-provision-page-size",
	    (pcell_t *)&sc->sc_prov_page_size, sizeof(sc->sc_prov_page_size));

	rid = 1;	/* Get the Hypervisor-level register set. */
	sc->sc_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
	    &rid, RF_ACTIVE);
	sc->sc_offset = XIVE_TM_QW3_HV;

	mtx_init(&sc->sc_mtx, "XIVE", NULL, MTX_DEF);
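	/*
	 * OPAL hands out virtual processors in power-of-two sized blocks.
	 * Compute the smallest order (log2 of the block size) that still
	 * covers every hardware thread ID; the fls() expression below is
	 * roughly log2(2 * mp_maxid).
	 */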
	/* Workaround for qemu single-thread powernv */
	if (mp_maxid == 0)
		order = 1;
	else
		order = fls(mp_maxid + (mp_maxid - 1)) - 1;
	do {
		vp_block = opal_call(OPAL_XIVE_ALLOCATE_VP_BLOCK, order);
		if (vp_block == OPAL_BUSY)
			DELAY(10);
		else if (vp_block == OPAL_XIVE_PROVISIONING)
			xive_provision_page(sc);
		else
			break;
	} while (1);

	if (vp_block < 0) {
		device_printf(dev,
		    "Unable to allocate VP block. Opal error %d\n",
		    (int)vp_block);
		bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->sc_mem);
		return (ENXIO);
	}
	/*
	 * Set up the VPs. Try to do as much as we can in attach, to lessen
	 * what's needed at AP spawn time.
	 */
	CPU_FOREACH(i) {
		vp_id = pcpu_find(i)->pc_hwref;

		xive_cpud = DPCPU_ID_PTR(i, xive_cpu_data);
		xive_cpud->vp = vp_id + vp_block;
		opal_call(OPAL_XIVE_GET_VP_INFO, xive_cpud->vp, NULL,
		    vtophys(&xive_cpud->cam), NULL, vtophys(&xive_cpud->chip));

		xive_cpud->cam = be64toh(xive_cpud->cam);
		xive_cpud->chip = be64toh(xive_cpud->chip);

		/* Allocate the queue page and populate the queue state data. */
		xive_cpud->queue.q_page = contigmalloc(PAGE_SIZE, M_XIVE,
		    M_ZERO | M_WAITOK, 0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0);
		xive_cpud->queue.q_size = 1 << PAGE_SHIFT;
		xive_cpud->queue.q_mask =
		    ((xive_cpud->queue.q_size / sizeof(int)) - 1);
		xive_cpud->queue.q_toggle = 0;
		xive_cpud->queue.q_index = 0;
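		/*
		 * Queue geometry, for example: a 4KB page holds 1024 32-bit
		 * entries, so q_mask is 0x3ff and the index arithmetic in
		 * xive_read_eq() wraps for free.
		 */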
		do {
			error = opal_call(OPAL_XIVE_SET_VP_INFO, xive_cpud->vp,
			    OPAL_XIVE_VP_ENABLED, 0);
		} while (error == OPAL_BUSY);
		error = opal_call(OPAL_XIVE_SET_QUEUE_INFO, vp_id,
		    XIVE_PRIORITY, vtophys(xive_cpud->queue.q_page), PAGE_SHIFT,
		    OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED);
		do {
			ipi_irq = opal_call(OPAL_XIVE_ALLOCATE_IRQ,
			    xive_cpud->chip);
		} while (ipi_irq == OPAL_BUSY);

		if (ipi_irq < 0)
			device_printf(root_pic,
			    "Failed allocating IPI. OPAL error %d\n",
			    (int)ipi_irq);
		else {
			xive_init_irq(&xive_cpud->ipi_data, ipi_irq);
			xive_cpud->ipi_data.vp = vp_id;
			xive_cpud->ipi_data.lirq = MAX_XIVE_IRQS;
			opal_call(OPAL_XIVE_SET_IRQ_CONFIG, ipi_irq,
			    xive_cpud->ipi_data.vp, XIVE_PRIORITY,
			    xive_cpud->ipi_data.lirq);
		}
	}
	powerpc_register_pic(dev, OF_xref_from_node(phandle), MAX_XIVE_IRQS,
	    1 /* Number of IPIs */, FALSE);
	root_pic = dev;

	xive_setup_cpu();
	powernv_smp_ap_extra_init = xive_smp_cpu_startup;

	return (0);
}
static int
xics_attach(device_t dev)
{
	phandle_t phandle = ofw_bus_get_node(dev);

	/* The XIVE (root PIC) will handle all our interrupts */
	powerpc_register_pic(root_pic, OF_xref_from_node(phandle),
	    MAX_XIVE_IRQS, 1 /* Number of IPIs */, FALSE);

	return (0);
}
static void
xive_bind(device_t dev, u_int irq, cpuset_t cpumask, void **priv)
{
	struct xive_irq *irqd;
	int cpu;
	int ncpus, i, error;

	if (*priv == NULL)
		*priv = xive_configure_irq(irq);

	irqd = *priv;

	/*
	 * This doesn't appear to actually support affinity groups, so pick a
	 * random CPU from the mask.
	 */
	ncpus = 0;
	CPU_FOREACH(cpu)
		if (CPU_ISSET(cpu, &cpumask))
			ncpus++;

	i = mftb() % ncpus;
	ncpus = 0;
	CPU_FOREACH(cpu) {
		if (!CPU_ISSET(cpu, &cpumask))
			continue;
		if (ncpus == i)
			break;
		ncpus++;
	}

	opal_call(OPAL_XIVE_SYNC, OPAL_XIVE_SYNC_QUEUE, irq);

	irqd->vp = pcpu_find(cpu)->pc_hwref;
	error = opal_call(OPAL_XIVE_SET_IRQ_CONFIG, irq, irqd->vp,
	    XIVE_PRIORITY, irqd->lirq);

	if (error < 0)
		panic("Cannot bind interrupt %d to CPU %d", irq, cpu);

	/* An EOI kicks the interrupt in its new home. */
	xive_eoi(dev, irq, irqd);
}
/* Read the next entry in the queue page and update the index. */
static uint32_t
xive_read_eq(struct xive_queue *q)
{
	uint32_t i = be32toh(q->q_page[q->q_index]);

	/* Check validity, using current queue polarity. */
	if ((i >> 31) == q->q_toggle)
		return (0);	/* Stale entry: the queue is empty. */

	q->q_index = (q->q_index + 1) & q->q_mask;

	/* Rolled around the queue; flip the expected polarity. */
	if (q->q_index == 0)
		q->q_toggle ^= 1;

	return (i & 0x7fffffff);
}
static void
xive_dispatch(device_t dev, struct trapframe *tf)
{
	struct xive_softc *sc;
	struct xive_cpu *xive_cpud;
	uint32_t vector;
	uint16_t ack;
	uint8_t cppr, he;

	sc = device_get_softc(dev);

	xive_cpud = DPCPU_PTR(xive_cpu_data);
	for (;;) {
		ack = xive_read_2(sc, XIVE_TM_SPC_ACK);
		cppr = (ack & 0xff);

		he = ack >> TM_QW3NSR_HE_SHIFT;

		if (he == TM_QW3_NSR_HE_NONE)
			break;
		else if (__predict_false(he != TM_QW3_NSR_HE_PHYS)) {
			/*
			 * We don't support TM_QW3_NSR_HE_POOL or
			 * TM_QW3_NSR_HE_LSI interrupts.
			 */
			device_printf(dev,
			    "Unexpected interrupt he type: %d\n", he);
			break;
		}

		xive_write_1(sc, XIVE_TM_CPPR, cppr);

		for (;;) {
			vector = xive_read_eq(&xive_cpud->queue);
			if (vector == 0)
				break;

			if (vector == MAX_XIVE_IRQS)
				vector = xive_ipi_vector;

			powerpc_dispatch_intr(vector, tf);
		}
	}
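	/*
	 * Re-open the priority window. CPPR 0xff is the least favored
	 * priority, so interrupts of any priority may now be presented.
	 */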
	xive_write_1(sc, XIVE_TM_CPPR, 0xff);
}
static void
xive_enable(device_t dev, u_int irq, u_int vector, void **priv)
{
	struct xive_irq *irqd;
	int status, cpu;

	if (irq == MAX_XIVE_IRQS) {
		if (xive_ipi_vector == -1)
			xive_ipi_vector = vector;
		return;
	}
	if (*priv == NULL)
		*priv = xive_configure_irq(irq);

	irqd = *priv;

	/* Bind to this CPU to start */
	cpu = PCPU_GET(hwref);
	irqd->lirq = vector;

	for (;;) {
		status = opal_call(OPAL_XIVE_SET_IRQ_CONFIG, irq, cpu,
		    XIVE_PRIORITY, vector);
		if (status != OPAL_BUSY)
			break;
		DELAY(10);
	}

	if (status != 0)
		panic("OPAL_SET_XIVE IRQ %d -> cpu %d failed: %d", irq,
		    cpu, status);

	xive_unmask(dev, irq, *priv);
}
static void
xive_eoi(device_t dev, u_int irq, void *priv)
{
	struct xive_irq *rirq;
	struct xive_cpu *cpud;
	uint64_t eoi_val;

	if (irq == MAX_XIVE_IRQS) {
		cpud = DPCPU_PTR(xive_cpu_data);
		rirq = &cpud->ipi_data;
	} else
		rirq = priv;

	if (rirq->flags & OPAL_XIVE_IRQ_STORE_EOI)
		xive_write_mmap8(rirq->eoi_page + XIVE_IRQ_STORE_EOI, 0);
	else if (rirq->flags & OPAL_XIVE_IRQ_LSI)
		xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_LOAD_EOI);
	else {
		/*
		 * The PQ_00 load resets the ESB state and returns the prior
		 * PQ bits; if an event was queued meanwhile (Q set),
		 * retrigger it.
		 */
		eoi_val = xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_PQ_00);
		if ((eoi_val & XIVE_IRQ_VAL_Q) && rirq->trig_page != 0)
			xive_write_mmap8(rirq->trig_page, 0);
	}
}
static void
xive_ipi(device_t dev, u_int cpu)
{
	struct xive_cpu *xive_cpud;

	xive_cpud = DPCPU_ID_PTR(cpu, xive_cpu_data);

	if (xive_cpud->ipi_data.trig_page == 0)
		return;
	/* The written value is irrelevant; the store itself triggers. */
	xive_write_mmap8(xive_cpud->ipi_data.trig_page, 0);
}
static void
xive_mask(device_t dev, u_int irq, void *priv)
{
	struct xive_irq *rirq;

	/* Never mask IPIs */
	if (irq == MAX_XIVE_IRQS)
		return;

	rirq = priv;

	if (!(rirq->flags & OPAL_XIVE_IRQ_LSI))
		return;
	/* A load from PQ_01 leaves the ESB in the masked (Q=1) state. */
	xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_PQ_01);
}
static void
xive_unmask(device_t dev, u_int irq, void *priv)
{
	struct xive_irq *rirq;

	rirq = priv;

	/* A load from PQ_00 resets the ESB to the unmasked state. */
	xive_read_mmap8(rirq->eoi_page + XIVE_IRQ_PQ_00);
}
static void
xive_translate_code(device_t dev, u_int irq, int code,
    enum intr_trigger *trig, enum intr_polarity *pol)
{
	switch (code) {
	case 0:
		/* L to H edge */
		*trig = INTR_TRIGGER_EDGE;
		*pol = INTR_POLARITY_HIGH;
		break;
	case 1:
		/* Active L level */
		*trig = INTR_TRIGGER_LEVEL;
		*pol = INTR_POLARITY_LOW;
		break;
	default:
		*trig = INTR_TRIGGER_CONFORM;
		*pol = INTR_POLARITY_CONFORM;
	}
}
/* Private functions. */

/*
 * Set up the current CPU. Called by the BSP at driver attachment, and by each
 * AP at wakeup (via xive_smp_cpu_startup()).
 */
static void
xive_setup_cpu(void)
{
	struct xive_softc *sc;
	struct xive_cpu *cpup;
	uint32_t val;

	cpup = DPCPU_PTR(xive_cpu_data);

	sc = device_get_softc(root_pic);

	val = bus_read_4(sc->sc_mem, XIVE_TM_QW2_HV_POOL + TM_WORD2);
	/* If a stale pool VP is cached on this thread, pull it off first. */
	if (val & TM_QW2W2_VP)
		bus_read_8(sc->sc_mem, XIVE_TM_SPC_PULL_POOL_CTX);

	/* Program this thread's pool CPPR and CAM line. */
	bus_write_4(sc->sc_mem, XIVE_TM_QW2_HV_POOL + TM_WORD0, 0xff);
	bus_write_4(sc->sc_mem, XIVE_TM_QW2_HV_POOL + TM_WORD2,
	    TM_QW2W2_VP | cpup->cam);

	xive_unmask(root_pic, cpup->ipi_data.girq, &cpup->ipi_data);
	xive_write_1(sc, XIVE_TM_CPPR, 0xff);
}
/* Populate an IRQ structure, mapping the EOI and trigger pages. */
static void
xive_init_irq(struct xive_irq *irqd, u_int irq)
{
	uint64_t eoi_phys, trig_phys;
	uint32_t esb_shift;

	opal_call(OPAL_XIVE_GET_IRQ_INFO, irq,
	    vtophys(&irqd->flags), vtophys(&eoi_phys),
	    vtophys(&trig_phys), vtophys(&esb_shift),
	    vtophys(&irqd->chip));

	/* OPAL fills the out-parameters in big-endian. */
	irqd->flags = be64toh(irqd->flags);
	eoi_phys = be64toh(eoi_phys);
	trig_phys = be64toh(trig_phys);
	esb_shift = be32toh(esb_shift);
	irqd->chip = be32toh(irqd->chip);

	irqd->girq = irq;
	irqd->esb_size = 1 << esb_shift;
	irqd->eoi_page = (vm_offset_t)pmap_mapdev(eoi_phys, irqd->esb_size);

	if (eoi_phys == trig_phys)
		irqd->trig_page = irqd->eoi_page;
	else if (trig_phys != 0)
		irqd->trig_page = (vm_offset_t)pmap_mapdev(trig_phys,
		    irqd->esb_size);
	else
		irqd->trig_page = 0;

	opal_call(OPAL_XIVE_GET_IRQ_CONFIG, irq, vtophys(&irqd->vp),
	    vtophys(&irqd->prio), vtophys(&irqd->lirq));

	irqd->vp = be64toh(irqd->vp);
	irqd->prio = be64toh(irqd->prio);
	irqd->lirq = be32toh(irqd->lirq);
}
/* Allocate an IRQ struct before populating it. */
static struct xive_irq *
xive_configure_irq(u_int irq)
{
	struct xive_irq *irqd;

	irqd = malloc(sizeof(struct xive_irq), M_XIVE, M_WAITOK);

	xive_init_irq(irqd, irq);

	return (irqd);
}
/*
 * Part of the OPAL API. OPAL_XIVE_ALLOCATE_VP_BLOCK might require more pages,
 * provisioned through this call.
 */
static int
xive_provision_page(struct xive_softc *sc)
{
	void *prov_page;
	int error;

	do {
		prov_page = contigmalloc(sc->sc_prov_page_size, M_XIVE, 0,
		    0, BUS_SPACE_MAXADDR,
		    sc->sc_prov_page_size, sc->sc_prov_page_size);

		error = opal_call(OPAL_XIVE_DONATE_PAGE, -1,
		    vtophys(prov_page));
	} while (error == OPAL_XIVE_PROVISIONING);

	return (0);
}
/* The XIVE_TM_CPPR register must be set by each thread */
static void
xive_smp_cpu_startup(void)
{

	xive_setup_cpu();
}