2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Machine dependent interrupt code for x86. For x86, we have to
32 * deal with different PICs. Thus, we use the passed in vector to lookup
33 * an interrupt source associated with that vector. The interrupt source
34 * describes which PIC the source belongs to and includes methods to handle
38 #include "opt_atpic.h"
42 #include <sys/param.h>
44 #include <sys/interrupt.h>
46 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49 #include <sys/mutex.h>
51 #include <sys/queue.h>
55 #include <sys/sysctl.h>
56 #include <sys/syslog.h>
57 #include <sys/systm.h>
58 #include <sys/taskqueue.h>
59 #include <sys/vmmeter.h>
60 #include <machine/clock.h>
61 #include <machine/intr_machdep.h>
62 #include <machine/smp.h>
68 #include <machine/segments.h>
69 #include <machine/frame.h>
70 #include <dev/ic/i8259.h>
71 #include <x86/isa/icu.h>
72 #include <isa/isareg.h>
/*
 * Stray-interrupt log threshold: intr_execute_handlers() logs each stray
 * while the per-source stray count is below this value and emits one last
 * notice when the count reaches it.
 */
77 #define MAX_STRAY_LOG 5
/*
 * One-argument callback type; used to cast the PIC enable-source and
 * EOI-source methods passed to intr_event_create().
 */
79 typedef void (*mask_fn)(void *);
/* Index into intrcnt used by intrcnt_register() and intrcnt_add(). */
81 static int intrcnt_index;
/* Registered sources indexed by vector; allocated in intr_init_sources(). */
82 static struct intsrc **interrupt_sources;
/* Scratch copy of interrupt_sources that intr_balance() sorts. */
84 static struct intsrc **interrupt_sorted;
/* Value behind the intrbalance sysctl declared on the next line. */
85 static int intrbalance;
86 SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RWTUN, &intrbalance, 0,
87 "Interrupt auto-balance interval (seconds). Zero disables.");
/* Task that runs intr_balance(); set up in intr_balance_init(). */
88 static struct timeout_task intrbalance_task;
/*
 * Locks, all initialized in intr_init(): intrsrc_lock (sx) is held around
 * interrupt_sources updates and PIC source programming, intrpic_lock
 * (mutex) around walks of the pics list, intrcnt_lock (spin mutex) around
 * intrcnt slot assignment.
 */
90 static struct sx intrsrc_lock;
91 static struct mtx intrpic_lock;
92 static struct mtx intrcnt_lock;
/* Registered interrupt controllers; new entries are appended at the tail. */
93 static TAILQ_HEAD(pics_head, pic) pics;
96 #if defined(SMP) && !defined(EARLY_AP_STARTUP)
/* Tested by intr_assign_cpu() before a CPU assignment is sent to the PIC. */
97 static int assign_cpu;
/*
 * Byte sizes of intrcnt and intrnames; overwritten in intr_init_sources()
 * once nintrcnt has been computed.
 */
102 size_t sintrcnt = sizeof(intrcnt);
103 size_t sintrnames = sizeof(intrnames);
106 static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");
/* Forward declarations of helpers defined later in this file. */
108 static int intr_assign_cpu(void *arg, int cpu);
109 static void intr_disable_src(void *arg);
110 static void intr_init(void *__dummy);
111 static int intr_pic_registered(struct pic *pic);
112 static void intrcnt_setname(const char *name, int index);
113 static void intrcnt_updatename(struct intsrc *is);
114 static void intrcnt_register(struct intsrc *is);
117 * SYSINIT levels for SI_SUB_INTR:
119 * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
120 * SI_ORDER_SECOND: Xen PICs
121 * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
122 * SI_ORDER_FOURTH: Add 8259A PICs
123 * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
124 * SI_ORDER_MIDDLE: SMP interrupt counters
125 * SI_ORDER_ANY: Enable interrupts on BSP
/*
 * Walk the pics list looking for the given controller.  Callers use the
 * result as a boolean (see intr_register_pic() and the KASSERT in
 * intr_register_source()).  The loop body and the return statements are
 * not visible in this listing.
 */
129 intr_pic_registered(struct pic *pic)
133 TAILQ_FOREACH(p, &pics, pics) {
141 * Register a new interrupt controller (PIC). This is to support suspend
142 * and resume where we suspend/resume controllers rather than individual
143 * sources. This also allows controllers with no active sources (such as
144 * 8259As in a system using the APICs) to participate in suspend and resume.
/*
 * Append pic to the tail of the pics list while holding intrpic_lock,
 * after first checking intr_pic_registered() for a duplicate.  What is
 * done for a duplicate, and the value returned, are on lines not shown
 * in this listing.
 */
147 intr_register_pic(struct pic *pic)
151 mtx_lock(&intrpic_lock);
152 if (intr_pic_registered(pic))
155 TAILQ_INSERT_TAIL(&pics, pic, pics);
158 mtx_unlock(&intrpic_lock);
163 * Allocate interrupt source arrays and register interrupt sources
164 * once the number of interrupts is known.
/*
 * SYSINIT hook (SI_SUB_INTR, SI_ORDER_FOURTH + 1).  Allocates the
 * per-vector source arrays and the intrcnt/intrnames tables, sized from
 * num_io_irqs and mp_ncpus, names counter slot 0, and then calls
 * pic_register_sources on every registered PIC that provides it.
 */
167 intr_init_sources(void *arg)
171 MPASS(num_io_irqs > 0);
/* M_ZERO: the arrays start zero-filled, so unused vectors read as NULL. */
173 interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
174 M_INTR, M_WAITOK | M_ZERO);
176 interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted),
177 M_INTR, M_WAITOK | M_ZERO);
181 * - 1 ??? dummy counter.
182 * - 2 counters for each I/O interrupt.
183 * - 1 counter for each CPU for lapic timer.
184 * - 1 counter for each CPU for the Hyper-V vmbus driver.
185 * - 8 counters for each CPU for IPI counters for SMP.
187 nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
190 nintrcnt += 8 * mp_ncpus;
192 intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
194 intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
/* Publish the table sizes in bytes now that nintrcnt is known. */
196 sintrcnt = nintrcnt * sizeof(u_long);
197 sintrnames = nintrcnt * (MAXCOMLEN + 1);
199 intrcnt_setname("???", 0);
203 * NB: intrpic_lock is not held here to avoid LORs due to
204 * malloc() in intr_register_source(). However, we are still
205 * single-threaded at this point in startup so the list of
206 * PICs shouldn't change.
208 TAILQ_FOREACH(pic, &pics, pics) {
209 if (pic->pic_register_sources != NULL)
210 pic->pic_register_sources(pic);
213 SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
217 * Register a new interrupt source with the global interrupt system.
218 * The global interrupts need to be disabled when this function is
/*
 * Register isrc in interrupt_sources[] under the vector reported by its
 * PIC.  An interrupt event is created for the source first; then, with
 * intrsrc_lock held, counter slots are assigned via intrcnt_register(),
 * the source is published and its handler count is reset to 0.  The slot
 * is tested for an existing entry both before the event is created and
 * again under the lock; in the locked case the new event is destroyed.
 * Return statements are on lines not shown in this listing.
 */
222 intr_register_source(struct intsrc *isrc)
226 KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
227 vector = isrc->is_pic->pic_vector(isrc);
228 KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
230 if (interrupt_sources[vector] != NULL)
232 error = intr_event_create(&isrc->is_event, isrc, 0, vector,
233 intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
234 (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
238 sx_xlock(&intrsrc_lock);
239 if (interrupt_sources[vector] != NULL) {
240 sx_xunlock(&intrsrc_lock);
241 intr_event_destroy(isrc->is_event);
244 intrcnt_register(isrc);
245 interrupt_sources[vector] = isrc;
246 isrc->is_handlers = 0;
247 sx_xunlock(&intrsrc_lock);
/*
 * Return the entry of interrupt_sources[] for vector.  Vectors outside
 * the range 0 .. num_io_irqs - 1 are caught by the range check before the
 * array is indexed; the value returned for them is on a line not shown
 * in this listing.
 */
252 intr_lookup_source(int vector)
255 if (vector < 0 || vector >= num_io_irqs)
257 return (interrupt_sources[vector]);
/*
 * Attach a filter/handler pair to the event of the source registered for
 * vector; the priority is derived from flags with intr_priority().  With
 * intrsrc_lock held the counter name is refreshed and, when is_handlers
 * reads 1, the source records domain and is enabled at its PIC through
 * pic_enable_intr and pic_enable_source.
 * NOTE(review): the update of is_handlers and the error checks are on
 * lines not shown in this listing.
 */
261 intr_add_handler(const char *name, int vector, driver_filter_t filter,
262 driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
268 isrc = intr_lookup_source(vector);
271 error = intr_event_add_handler(isrc->is_event, name, filter, handler,
272 arg, intr_priority(flags), flags, cookiep);
274 sx_xlock(&intrsrc_lock);
275 intrcnt_updatename(isrc);
277 if (isrc->is_handlers == 1) {
278 isrc->is_domain = domain;
279 isrc->is_pic->pic_enable_intr(isrc);
280 isrc->is_pic->pic_enable_source(isrc);
282 sx_xunlock(&intrsrc_lock);
/*
 * Detach the handler identified by cookie.  The owning source is looked
 * up from the cookie before the handler is removed.  With intrsrc_lock
 * held, a source whose is_handlers reads 0 is disabled at its PIC
 * (pic_disable_source with PIC_NO_EOI, then pic_disable_intr), and the
 * counter name is refreshed.
 * NOTE(review): the update of is_handlers and the error checks are on
 * lines not shown in this listing.
 */
288 intr_remove_handler(void *cookie)
293 isrc = intr_handler_source(cookie);
294 error = intr_event_remove_handler(cookie);
296 sx_xlock(&intrsrc_lock);
298 if (isrc->is_handlers == 0) {
299 isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
300 isrc->is_pic->pic_disable_intr(isrc);
302 intrcnt_updatename(isrc);
303 sx_xunlock(&intrsrc_lock);
/*
 * Forward a trigger/polarity configuration request to pic_config_intr of
 * the PIC that owns the source registered for vector, and return its
 * result.  Handling of an unknown vector is on lines not shown here.
 */
309 intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol)
313 isrc = intr_lookup_source(vector);
316 return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
/*
 * Event callback handed to intr_event_create() by intr_register_source().
 * Calls pic_disable_source on the owning PIC with PIC_EOI.
 */
320 intr_disable_src(void *arg)
325 isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
/*
 * Dispatch an interrupt from isrc to its event through
 * intr_event_handle().  A nonzero result is treated as a stray interrupt:
 * pic_disable_source is called with PIC_EOI, the stray counter of the
 * source is incremented, and the stray is logged until that counter
 * reaches MAX_STRAY_LOG, at which point one final notice is logged.
 */
329 intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
331 struct intr_event *ie;
335 * We count software interrupts when we process them. The
336 * code here follows previous practice, but there's an
337 * argument for counting hardware interrupts when they're
346 * XXX: We assume that IRQ 0 is only used for the ISA timer
349 vector = isrc->is_pic->pic_vector(isrc);
354 * For stray interrupts, mask and EOI the source, bump the
355 * stray count, and log the condition.
357 if (intr_event_handle(ie, frame) != 0) {
358 isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
359 (*isrc->is_straycount)++;
360 if (*isrc->is_straycount < MAX_STRAY_LOG)
361 log(LOG_ERR, "stray irq%d\n", vector);
362 else if (*isrc->is_straycount == MAX_STRAY_LOG)
364 "too many stray irq %d's: not logging anymore\n",
/*
 * Resume every registered PIC that provides pic_resume, walking the pics
 * list front to back with intrpic_lock held.  suspend_cancelled is passed
 * through unchanged to each controller.
 */
370 intr_resume(bool suspend_cancelled)
377 mtx_lock(&intrpic_lock);
378 TAILQ_FOREACH(pic, &pics, pics) {
379 if (pic->pic_resume != NULL)
380 pic->pic_resume(pic, suspend_cancelled);
382 mtx_unlock(&intrpic_lock);
/*
 * NOTE(review): the header of the function that owns the lines below is
 * not visible in this listing.  The visible lines walk the pics list in
 * reverse with intrpic_lock held and call pic_suspend on each controller
 * that provides it.
 */
390 mtx_lock(&intrpic_lock);
391 TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) {
392 if (pic->pic_suspend != NULL)
393 pic->pic_suspend(pic);
395 mtx_unlock(&intrpic_lock);
/*
 * Event callback handed to intr_event_create() by intr_register_source().
 * When the guard for the compiled configuration passes, cpu is translated
 * through cpu_apic_ids[] and given to pic_assign_cpu of the owning PIC
 * with intrsrc_lock held.  Both guards require cpu != NOCPU, so a NOCPU
 * request never reaches the PIC.
 */
399 intr_assign_cpu(void *arg, int cpu)
405 #ifdef EARLY_AP_STARTUP
406 MPASS(mp_ncpus == 1 || smp_started);
408 /* Nothing to do if there is only a single CPU. */
409 if (mp_ncpus > 1 && cpu != NOCPU) {
412 * Don't do anything during early boot. We will pick up the
413 * assignment once the APs are started.
415 if (assign_cpu && cpu != NOCPU) {
418 sx_xlock(&intrsrc_lock);
419 error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
422 sx_xunlock(&intrsrc_lock);
/*
 * Write name into slot index of intrnames.  Slots are MAXCOMLEN + 1 bytes
 * wide and the write is bounded to that size by snprintf().  The format
 * left-justifies the name in a field whose width argument is on a line
 * not shown in this listing.
 */
432 intrcnt_setname(const char *name, int index)
435 snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s",
/*
 * Refresh the intrnames slot of a source (slot is_index) from the full
 * name of its event.
 */
440 intrcnt_updatename(struct intsrc *is)
443 intrcnt_setname(is->is_event->ie_fullname, is->is_index);
/*
 * Assign two adjacent intrcnt slots to a source while holding
 * intrcnt_lock.  Slot is_index is named after the event and becomes
 * is_count; slot is_index + 1 is named from the vector of the source and
 * becomes is_straycount.  The event of the source must already exist.
 * NOTE(review): the statement that advances intrcnt_index is on a line
 * not shown in this listing.
 */
447 intrcnt_register(struct intsrc *is)
449 char straystr[MAXCOMLEN + 1];
451 KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
452 mtx_lock_spin(&intrcnt_lock);
453 MPASS(intrcnt_index + 2 <= nintrcnt);
454 is->is_index = intrcnt_index;
456 snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
457 is->is_pic->pic_vector(is));
458 intrcnt_updatename(is);
459 is->is_count = &intrcnt[is->is_index];
460 intrcnt_setname(straystr, is->is_index + 1);
461 is->is_straycount = &intrcnt[is->is_index + 1];
462 mtx_unlock_spin(&intrcnt_lock);
/*
 * Hand out a single named counter.  With intrcnt_lock held, *countp is
 * pointed at the intrcnt slot selected by intrcnt_index and that slot is
 * labelled with name.
 * NOTE(review): the statement that advances intrcnt_index is on a line
 * not shown in this listing.
 */
466 intrcnt_add(const char *name, u_long **countp)
469 mtx_lock_spin(&intrcnt_lock);
470 MPASS(intrcnt_index < nintrcnt);
471 *countp = &intrcnt[intrcnt_index];
472 intrcnt_setname(name, intrcnt_index);
474 mtx_unlock_spin(&intrcnt_lock);
/*
 * SYSINIT hook (SI_SUB_INTR, SI_ORDER_FIRST).  Initializes the three
 * locks used in this file: intrpic_lock as an MTX_DEF mutex, intrsrc_lock
 * as an sx lock and intrcnt_lock as an MTX_SPIN mutex.
 */
478 intr_init(void *dummy __unused)
482 mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
483 sx_init(&intrsrc_lock, "intrsrc");
484 mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
486 SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
/*
 * SYSINIT hook registered at SI_SUB_INTR, SI_ORDER_ANY.
 * NOTE(review): the statement that performs the work is on a line not
 * shown in this listing; only the explanatory text below is visible.
 */
489 intr_init_final(void *dummy __unused)
493 * Enable interrupts on the BSP after all of the interrupt
494 * controllers are initialized. Device interrupts are still
495 * disabled in the interrupt controllers until interrupt
496 * handlers are registered. Interrupts are enabled on each AP
497 * after their first context switch.
501 SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
504 /* Initialize the two 8259A's to a known-good shutdown state. */
/*
 * NOTE(review): the header of the function that owns these lines is not
 * visible in this listing.
 * First controller (IO_ICU1): an ICW1 reset command, then four writes to
 * the port at ICU_IMR_OFFSET (IDT_IO_INTS, the mask bit for ICU_SLAVEID,
 * MASTER_MODE, 0xff), then an OCW3 command with OCW3_RR.
 */
509 outb(IO_ICU1, ICW1_RESET | ICW1_IC4);
510 outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);
511 outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));
512 outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);
513 outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
514 outb(IO_ICU1, OCW3_SEL | OCW3_RR);
/*
 * Second controller (IO_ICU2): the same sequence, using IDT_IO_INTS + 8,
 * ICU_SLAVEID and SLAVE_MODE.
 */
516 outb(IO_ICU2, ICW1_RESET | ICW1_IC4);
517 outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);
518 outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);
519 outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);
520 outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
521 outb(IO_ICU2, OCW3_SEL | OCW3_RR);
525 /* Add a description to an active interrupt handler. */
/*
 * Attach descr to handler ih on the event of the source registered for
 * vector via intr_event_describe_handler(), then refresh the counter
 * name of the source from the event.  Error handling is on lines not
 * shown in this listing.
 */
527 intr_describe(u_int vector, void *ih, const char *descr)
532 isrc = intr_lookup_source(vector);
535 error = intr_event_describe_handler(isrc->is_event, ih, descr);
538 intrcnt_updatename(isrc);
/*
 * NOTE(review): the header of the function that owns the lines below is
 * not visible in this listing.  With intrsrc_lock held, the visible lines
 * visit every vector and call pic_reprogram_pin on sources whose PIC
 * provides it.
 */
548 sx_xlock(&intrsrc_lock);
549 for (v = 0; v < num_io_irqs; v++) {
550 is = interrupt_sources[v];
553 if (is->is_pic->pic_reprogram_pin != NULL)
554 is->is_pic->pic_reprogram_pin(is);
556 sx_xunlock(&intrsrc_lock);
561 * Dump data about interrupt handlers
/*
 * Debugger command irqs.  Steps through interrupt_sources[] and passes
 * the event of an entry to db_dump_intr_event(); the loop stops early
 * once db_pager_quit is set.  modif is compared against the v modifier;
 * the statement that the comparison guards is on a line not shown here.
 */
563 DB_SHOW_COMMAND(irqs, db_show_irqs)
565 struct intsrc **isrc;
569 if (strcmp(modif, "v") == 0)
573 isrc = interrupt_sources;
574 for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
576 db_dump_intr_event((*isrc)->is_event, verbose);
582 * Support for balancing interrupt sources across CPUs. For now we just
583 * allocate CPUs round-robin.
/*
 * CPUs that may be chosen as interrupt destinations.  Initialized with
 * the mask 0x1; further CPUs are added by intr_add_cpu().
 */
586 cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
/*
 * Per-domain round-robin cursor: the CPU whose APIC id intr_next_cpu()
 * hands out next for that domain.
 */
587 static int current_cpu[MAXMEMDOM];
/*
 * NOTE(review): the header of the function that owns the lines below is
 * not visible in this listing.  For each of the vm_ndomains domains the
 * visible lines test whether the cursor is missing from intr_cpus or from
 * the CPU set of the domain; the action taken is on a line not shown.
 */
594 for (i = 0; i < vm_ndomains; i++) {
596 if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
597 !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
603 * Return the CPU that the next interrupt source should use. For now
604 * this just returns the next local APIC according to round-robin.
/*
 * Pick the APIC id for the next interrupt source in domain and advance
 * the per-domain cursor.  The id is read from cpu_apic_ids[] at the
 * current cursor; the cursor then steps forward, wrapping to 0 once it
 * exceeds mp_maxid, until it lands on a CPU present in both intr_cpus and
 * the CPU set of the domain.  The two early returns hand back the APIC id
 * of the running CPU; their guarding conditions are on lines not shown in
 * this listing.
 */
607 intr_next_cpu(int domain)
611 #ifdef EARLY_AP_STARTUP
612 MPASS(mp_ncpus == 1 || smp_started);
614 return (PCPU_GET(apic_id));
616 /* Leave all interrupts on the BSP during boot. */
618 return (PCPU_GET(apic_id));
/* The cursor is read and advanced with icu_lock held. */
621 mtx_lock_spin(&icu_lock);
622 apic_id = cpu_apic_ids[current_cpu[domain]];
624 current_cpu[domain]++;
625 if (current_cpu[domain] > mp_maxid)
626 current_cpu[domain] = 0;
627 } while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
628 !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]));
629 mtx_unlock_spin(&icu_lock);
633 /* Attempt to bind the specified IRQ to the specified CPU. */
/*
 * Delegates to intr_event_bind() on the event of the source registered
 * for vector and returns its result.  Handling of an unknown vector is on
 * lines not shown in this listing.
 */
635 intr_bind(u_int vector, u_char cpu)
639 isrc = intr_lookup_source(vector);
642 return (intr_event_bind(isrc->is_event, cpu));
646 * Add a CPU to our mask of valid CPUs that can be destinations of
/*
 * Mark cpu as a valid interrupt destination by setting it in intr_cpus.
 * An invalid id leads to panic(); the validity test and the condition
 * guarding the printf() are on lines not shown in this listing.
 */
650 intr_add_cpu(u_int cpu)
654 panic("%s: Invalid CPU ID", __func__);
656 printf("INTR: Adding local APIC %d as a target\n",
659 CPU_SET(cpu, &intr_cpus);
/*
 * Compiled only when EARLY_AP_STARTUP is defined.  SYSINIT hook at
 * SI_SUB_SMP, SI_ORDER_SECOND.
 * NOTE(review): the body of the function is not visible in this listing.
 */
662 #ifdef EARLY_AP_STARTUP
664 intr_smp_startup(void *arg __unused)
670 SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
675 * Distribute all the interrupt sources among the available CPUs once the
676 * AP's have been launched.
/*
 * SYSINIT hook (SI_SUB_SMP, SI_ORDER_SECOND).  With intrsrc_lock held,
 * visits every registered source that has at least one handler and
 * programs its PIC with a target CPU taken from the event binding
 * (ie_cpu) or from the round-robin cursor of the domain of the source.
 * After a successful pic_assign_cpu the cursor is advanced through
 * intr_next_cpu() only when the event has no binding (ie_cpu == NOCPU).
 */
679 intr_shuffle_irqs(void *arg __unused)
685 /* Don't bother on UP. */
689 /* Round-robin assign a CPU to each enabled source. */
690 sx_xlock(&intrsrc_lock);
692 for (i = 0; i < num_io_irqs; i++) {
693 isrc = interrupt_sources[i];
694 if (isrc != NULL && isrc->is_handlers > 0) {
696 * If this event is already bound to a CPU,
697 * then assign the source to that CPU instead
698 * of picking one via round-robin. Note that
699 * this is careful to only advance the
700 * round-robin if the CPU assignment succeeds.
702 cpu = isrc->is_event->ie_cpu;
704 cpu = current_cpu[isrc->is_domain];
705 if (isrc->is_pic->pic_assign_cpu(isrc,
706 cpu_apic_ids[cpu]) == 0) {
708 if (isrc->is_event->ie_cpu == NOCPU)
709 intr_next_cpu(isrc->is_domain);
713 sx_xunlock(&intrsrc_lock);
715 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
720 * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
/*
 * Handler for the read-only string sysctl named intrs under _hw.  Wires
 * the old buffer, then formats one line per interrupt source into an
 * sbuf while holding intrsrc_lock shared, and finishes the sbuf.
 * NOTE(review): the output format prints a domain field that the sysctl
 * description string at the end of this block does not mention.
 */
723 sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
730 error = sysctl_wire_old_buffer(req, 0);
734 sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
735 sx_slock(&intrsrc_lock);
736 for (i = 0; i < num_io_irqs; i++) {
737 isrc = interrupt_sources[i];
740 sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
741 isrc->is_event->ie_fullname,
748 sx_sunlock(&intrsrc_lock);
749 error = sbuf_finish(&sbuf);
753 SYSCTL_PROC(_hw, OID_AUTO, intrs,
754 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
755 0, 0, sysctl_hw_intrs, "A",
756 "interrupt:number @cpu: count");
759 * Compare two, possibly NULL, entries in the interrupt source array
/*
 * Comparator over array elements that are pointers to struct intsrc.
 * When both entries are non-NULL they are ordered by the difference of
 * the counters behind is_count.  Handling of NULL entries is on lines not
 * shown in this listing.
 * NOTE(review): is_count points into intrcnt, which is allocated with
 * sizeof(u_long) elements.  If this function returns int (the return type
 * is on a line not shown), the raw difference can be truncated or change
 * sign for widely separated counts; comparing with < and > would avoid
 * that.  Confirm before changing.
 */
763 intrcmp(const void *one, const void *two)
765 const struct intsrc *i1, *i2;
767 i1 = *(const struct intsrc * const *)one;
768 i2 = *(const struct intsrc * const *)two;
769 if (i1 != NULL && i2 != NULL)
770 return (*i1->is_count - *i2->is_count);
779 * Balance IRQs across available CPUs according to load.
/*
 * Task body that redistributes interrupt sources across CPUs.  With
 * intrsrc_lock held it copies interrupt_sources into interrupt_sorted,
 * sorts the copy with qsort(), and walks it from the last element to the
 * first.  Entries that are NULL or whose event is bound to a CPU
 * (ie_cpu != NOCPU) are filtered by the test in the loop.  Each remaining
 * source takes the current round-robin CPU of its domain, the cursor is
 * advanced, and pic_assign_cpu is called only when that CPU differs from
 * is_cpu.  Finally the task re-queues itself after intrbalance seconds,
 * or after 60 seconds when the interval reads zero.
 */
782 intr_balance(void *dummy __unused, int pending __unused)
789 interval = intrbalance;
794 * Sort interrupts according to count.
796 sx_xlock(&intrsrc_lock);
797 memcpy(interrupt_sorted, interrupt_sources, num_io_irqs *
798 sizeof(interrupt_sorted[0]));
799 qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]),
803 * Restart the scan from the same location to avoid moving in the
809 * Assign round-robin from most loaded to least.
811 for (i = num_io_irqs - 1; i >= 0; i--) {
812 isrc = interrupt_sorted[i];
813 if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
815 cpu = current_cpu[isrc->is_domain];
816 intr_next_cpu(isrc->is_domain);
817 if (isrc->is_cpu != cpu &&
818 isrc->is_pic->pic_assign_cpu(isrc,
819 cpu_apic_ids[cpu]) == 0)
822 sx_xunlock(&intrsrc_lock);
824 taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
825 interval ? hz * interval : hz * 60);
/*
 * SYSINIT hook (SI_SUB_SMP, SI_ORDER_ANY).  Initializes intrbalance_task
 * on taskqueue_thread with intr_balance as its function and queues the
 * first run with a delay of hz ticks.
 */
830 intr_balance_init(void *dummy __unused)
833 TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
835 taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
837 SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);
841 * Always route interrupts to the current processor in the UP case.
/*
 * Second definition of intr_next_cpu() in this file.  The visible body
 * returns the APIC id of the running CPU and does not consult domain.
 */
844 intr_next_cpu(int domain)
847 return (PCPU_GET(apic_id));