/* sys/x86/x86/local_apic.c */
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
5  * Copyright (c) 1996, by Steve Passe
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. The name of the developer may NOT be used to endorse or promote products
14  *    derived from this software without specific prior written permission.
15  * 3. Neither the name of the author nor the names of any co-contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 /*
33  * Local APIC support on Pentium and later processors.
34  */
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include "opt_atpic.h"
40 #include "opt_hwpmc_hooks.h"
41
42 #include "opt_ddb.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/bus.h>
47 #include <sys/kernel.h>
48 #include <sys/lock.h>
49 #include <sys/malloc.h>
50 #include <sys/mutex.h>
51 #include <sys/pcpu.h>
52 #include <sys/proc.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
55 #include <sys/sysctl.h>
56 #include <sys/timeet.h>
57
58 #include <vm/vm.h>
59 #include <vm/pmap.h>
60
61 #include <x86/apicreg.h>
62 #include <machine/clock.h>
63 #include <machine/cpufunc.h>
64 #include <machine/cputypes.h>
65 #include <machine/frame.h>
66 #include <machine/intr_machdep.h>
67 #include <x86/apicvar.h>
68 #include <x86/mca.h>
69 #include <machine/md_var.h>
70 #include <machine/smp.h>
71 #include <machine/specialreg.h>
72 #include <x86/init.h>
73
74 #ifdef DDB
75 #include <sys/interrupt.h>
76 #include <ddb/ddb.h>
77 #endif
78
79 #ifdef __amd64__
80 #define SDT_APIC        SDT_SYSIGT
81 #define SDT_APICT       SDT_SYSIGT
82 #define GSEL_APIC       0
83 #else
84 #define SDT_APIC        SDT_SYS386IGT
85 #define SDT_APICT       SDT_SYS386TGT
86 #define GSEL_APIC       GSEL(GCODE_SEL, SEL_KPL)
87 #endif
88
89 static MALLOC_DEFINE(M_LAPIC, "local_apic", "Local APIC items");
90
91 /* Sanity checks on IDT vectors. */
92 CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT);
93 CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS);
94 CTASSERT(APIC_LOCAL_INTS == 240);
95 CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
96
97 /* Magic IRQ values for the timer and syscalls. */
98 #define IRQ_TIMER       (NUM_IO_INTS + 1)
99 #define IRQ_SYSCALL     (NUM_IO_INTS + 2)
100 #define IRQ_DTRACE_RET  (NUM_IO_INTS + 3)
101 #define IRQ_EVTCHN      (NUM_IO_INTS + 4)
102
/* Operating modes of the local APIC timer (see native_lapic_setup()). */
enum lat_timer_mode {
        LAT_MODE_UNDEF =        0,      /* timer not configured */
        LAT_MODE_PERIODIC =     1,
        LAT_MODE_ONESHOT =      2,
        LAT_MODE_DEADLINE =     3,
};
109
110 /*
111  * Support for local APICs.  Local APICs manage interrupts on each
112  * individual processor as opposed to I/O APICs which receive interrupts
113  * from I/O devices and then forward them on to the local APICs.
114  *
115  * Local APICs can also send interrupts to each other thus providing the
116  * mechanism for IPIs.
117  */
118
/*
 * Software description of one local vector table (LVT) entry; the
 * hardware register value is rebuilt from it by lvt_mode_impl().
 */
struct lvt {
        u_int lvt_edgetrigger:1;        /* 1: edge triggered, 0: level */
        u_int lvt_activehi:1;           /* 1: active high, 0: active low */
        u_int lvt_masked:1;             /* 1: interrupt delivery masked */
        u_int lvt_active:1;             /* 1: per-CPU override is in effect */
        u_int lvt_mode:16;              /* delivery mode (APIC_LVT_DM_*) */
        u_int lvt_vector:8;             /* IDT vector, used for DM_FIXED */
};
127
128 struct lapic {
129         struct lvt la_lvts[APIC_LVT_MAX + 1];
130         struct lvt la_elvts[APIC_ELVT_MAX + 1];;
131         u_int la_id:8;
132         u_int la_cluster:4;
133         u_int la_cluster_id:2;
134         u_int la_present:1;
135         u_long *la_timer_count;
136         uint64_t la_timer_period;
137         enum lat_timer_mode la_timer_mode;
138         uint32_t lvt_timer_base;
139         uint32_t lvt_timer_last;
140         /* Include IDT_SYSCALL to make indexing easier. */
141         int la_ioint_irqs[APIC_NUM_IOINTS + 1];
142 } static *lapics;
143
/* Global defaults for local APIC LVT entries. */
static struct lvt lvts[APIC_LVT_MAX + 1] = {
        { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 },  /* LINT0: masked ExtINT */
        { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },     /* LINT1: NMI */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT },      /* Timer */
        { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },      /* Error */
        { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },     /* PMC */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },    /* Thermal */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },        /* CMCI */
};

/* Global defaults for AMD local APIC ELVT entries (all masked, inactive). */
static struct lvt elvts[APIC_ELVT_MAX + 1] = {
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT },
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
};

/* I/O interrupt entry points, one per bank of 32 IDT vectors. */
static inthand_t *ioint_handlers[] = {
        NULL,                   /* 0 - 31 */
        IDTVEC(apic_isr1),      /* 32 - 63 */
        IDTVEC(apic_isr2),      /* 64 - 95 */
        IDTVEC(apic_isr3),      /* 96 - 127 */
        IDTVEC(apic_isr4),      /* 128 - 159 */
        IDTVEC(apic_isr5),      /* 160 - 191 */
        IDTVEC(apic_isr6),      /* 192 - 223 */
        IDTVEC(apic_isr7),      /* 224 - 255 */
};

/* PTI (page-table isolation) flavors of the handlers above. */
static inthand_t *ioint_pti_handlers[] = {
        NULL,                   /* 0 - 31 */
        IDTVEC(apic_isr1_pti),  /* 32 - 63 */
        IDTVEC(apic_isr2_pti),  /* 64 - 95 */
        IDTVEC(apic_isr3_pti),  /* 96 - 127 */
        IDTVEC(apic_isr4_pti),  /* 128 - 159 */
        IDTVEC(apic_isr5_pti),  /* 160 - 191 */
        IDTVEC(apic_isr6_pti),  /* 192 - 223 */
        IDTVEC(apic_isr7_pti),  /* 224 - 255 */
};

/* Timer divisor codes in increasing power-of-two order (1 .. 128). */
static u_int32_t lapic_timer_divisors[] = {
        APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
        APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
};
189
extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);

volatile char *lapic_map;       /* xAPIC memory-mapped register window */
vm_paddr_t lapic_paddr;         /* physical address of the register page */
int x2apic_mode;                /* non-zero when operating in x2APIC mode */
int lapic_eoi_suppression;      /* non-zero when EOI broadcast is suppressed */
static int lapic_timer_tsc_deadline;    /* use TSC-deadline timer mode */
static u_long lapic_timer_divisor, count_freq;
static struct eventtimer lapic_et;
#ifdef SMP
/* Busy-loop iterations that delay ~1us; see calibration in lapic_init(). */
static uint64_t lapic_ipi_wait_mult;
#endif
unsigned int max_apic_id;       /* largest APIC ID representable in lapics[] */

SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, "");
SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD,
    &lapic_eoi_suppression, 0, "");
SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD,
    &lapic_timer_tsc_deadline, 0, "");
210
211 static uint32_t
212 lapic_read32(enum LAPIC_REGISTERS reg)
213 {
214         uint32_t res;
215
216         if (x2apic_mode) {
217                 res = rdmsr32(MSR_APIC_000 + reg);
218         } else {
219                 res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL);
220         }
221         return (res);
222 }
223
224 static void
225 lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val)
226 {
227
228         if (x2apic_mode) {
229                 mfence();
230                 lfence();
231                 wrmsr(MSR_APIC_000 + reg, val);
232         } else {
233                 *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
234         }
235 }
236
237 static void
238 lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val)
239 {
240
241         if (x2apic_mode) {
242                 wrmsr(MSR_APIC_000 + reg, val);
243         } else {
244                 *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
245         }
246 }
247
248 #ifdef SMP
249 static uint64_t
250 lapic_read_icr(void)
251 {
252         uint64_t v;
253         uint32_t vhi, vlo;
254
255         if (x2apic_mode) {
256                 v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO);
257         } else {
258                 vhi = lapic_read32(LAPIC_ICR_HI);
259                 vlo = lapic_read32(LAPIC_ICR_LO);
260                 v = ((uint64_t)vhi << 32) | vlo;
261         }
262         return (v);
263 }
264
/*
 * Read only the low ICR word; also used as a cheap register access in
 * the IPI-wait calibration loop in native_lapic_init().
 */
static uint64_t
lapic_read_icr_lo(void)
{

        return (lapic_read32(LAPIC_ICR_LO));
}
271
/*
 * Write the interrupt command register.  In xAPIC mode ICR_HI
 * (destination) must be programmed before ICR_LO, since the write to
 * ICR_LO is what dispatches the command; x2APIC takes the combined
 * 64-bit value in a single MSR write.
 */
static void
lapic_write_icr(uint32_t vhi, uint32_t vlo)
{
        uint64_t v;

        if (x2apic_mode) {
                v = ((uint64_t)vhi << 32) | vlo;
                mfence();
                wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
        } else {
                lapic_write32(LAPIC_ICR_HI, vhi);
                lapic_write32(LAPIC_ICR_LO, vlo);
        }
}
286 #endif /* SMP */
287
288 static void
289 native_lapic_enable_x2apic(void)
290 {
291         uint64_t apic_base;
292
293         apic_base = rdmsr(MSR_APICBASE);
294         apic_base |= APICBASE_X2APIC | APICBASE_ENABLED;
295         wrmsr(MSR_APICBASE, apic_base);
296 }
297
298 static bool
299 native_lapic_is_x2apic(void)
300 {
301         uint64_t apic_base;
302
303         apic_base = rdmsr(MSR_APICBASE);
304         return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) ==
305             (APICBASE_X2APIC | APICBASE_ENABLED));
306 }
307
308 static void     lapic_enable(void);
309 static void     lapic_resume(struct pic *pic, bool suspend_cancelled);
310 static void     lapic_timer_oneshot(struct lapic *);
311 static void     lapic_timer_oneshot_nointr(struct lapic *, uint32_t);
312 static void     lapic_timer_periodic(struct lapic *);
313 static void     lapic_timer_deadline(struct lapic *);
314 static void     lapic_timer_stop(struct lapic *);
315 static void     lapic_timer_set_divisor(u_int divisor);
316 static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value);
317 static int      lapic_et_start(struct eventtimer *et,
318                     sbintime_t first, sbintime_t period);
319 static int      lapic_et_stop(struct eventtimer *et);
320 static u_int    apic_idt_to_irq(u_int apic_id, u_int vector);
321 static void     lapic_set_tpr(u_int vector);
322
/* PIC glue for the local APIC; only the resume method is provided. */
struct pic lapic_pic = { .pic_resume = lapic_resume };
324
325 /* Forward declarations for apic_ops */
326 static void     native_lapic_create(u_int apic_id, int boot_cpu);
327 static void     native_lapic_init(vm_paddr_t addr);
328 static void     native_lapic_xapic_mode(void);
329 static void     native_lapic_setup(int boot);
330 static void     native_lapic_dump(const char *str);
331 static void     native_lapic_disable(void);
332 static void     native_lapic_eoi(void);
333 static int      native_lapic_id(void);
334 static int      native_lapic_intr_pending(u_int vector);
335 static u_int    native_apic_cpuid(u_int apic_id);
336 static u_int    native_apic_alloc_vector(u_int apic_id, u_int irq);
337 static u_int    native_apic_alloc_vectors(u_int apic_id, u_int *irqs,
338                     u_int count, u_int align);
339 static void     native_apic_disable_vector(u_int apic_id, u_int vector);
340 static void     native_apic_enable_vector(u_int apic_id, u_int vector);
341 static void     native_apic_free_vector(u_int apic_id, u_int vector, u_int irq);
342 static void     native_lapic_set_logical_id(u_int apic_id, u_int cluster,
343                     u_int cluster_id);
344 static int      native_lapic_enable_pmc(void);
345 static void     native_lapic_disable_pmc(void);
346 static void     native_lapic_reenable_pmc(void);
347 static void     native_lapic_enable_cmc(void);
348 static int      native_lapic_enable_mca_elvt(void);
349 static int      native_lapic_set_lvt_mask(u_int apic_id, u_int lvt,
350                     u_char masked);
351 static int      native_lapic_set_lvt_mode(u_int apic_id, u_int lvt,
352                     uint32_t mode);
353 static int      native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
354                     enum intr_polarity pol);
355 static int      native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
356                     enum intr_trigger trigger);
357 #ifdef SMP
358 static void     native_lapic_ipi_raw(register_t icrlo, u_int dest);
359 static void     native_lapic_ipi_vectored(u_int vector, int dest);
360 static int      native_lapic_ipi_wait(int delay);
361 #endif /* SMP */
362 static int      native_lapic_ipi_alloc(inthand_t *ipifunc);
363 static void     native_lapic_ipi_free(int vector);
364
/*
 * Dispatch table of local APIC operations, populated with the native
 * (bare-metal) implementations declared above.
 */
struct apic_ops apic_ops = {
        .create                 = native_lapic_create,
        .init                   = native_lapic_init,
        .xapic_mode             = native_lapic_xapic_mode,
        .is_x2apic              = native_lapic_is_x2apic,
        .setup                  = native_lapic_setup,
        .dump                   = native_lapic_dump,
        .disable                = native_lapic_disable,
        .eoi                    = native_lapic_eoi,
        .id                     = native_lapic_id,
        .intr_pending           = native_lapic_intr_pending,
        .set_logical_id         = native_lapic_set_logical_id,
        .cpuid                  = native_apic_cpuid,
        .alloc_vector           = native_apic_alloc_vector,
        .alloc_vectors          = native_apic_alloc_vectors,
        .enable_vector          = native_apic_enable_vector,
        .disable_vector         = native_apic_disable_vector,
        .free_vector            = native_apic_free_vector,
        .enable_pmc             = native_lapic_enable_pmc,
        .disable_pmc            = native_lapic_disable_pmc,
        .reenable_pmc           = native_lapic_reenable_pmc,
        .enable_cmc             = native_lapic_enable_cmc,
        .enable_mca_elvt        = native_lapic_enable_mca_elvt,
#ifdef SMP
        .ipi_raw                = native_lapic_ipi_raw,
        .ipi_vectored           = native_lapic_ipi_vectored,
        .ipi_wait               = native_lapic_ipi_wait,
#endif
        .ipi_alloc              = native_lapic_ipi_alloc,
        .ipi_free               = native_lapic_ipi_free,
        .set_lvt_mask           = native_lapic_set_lvt_mask,
        .set_lvt_mode           = native_lapic_set_lvt_mode,
        .set_lvt_polarity       = native_lapic_set_lvt_polarity,
        .set_lvt_triggermode    = native_lapic_set_lvt_triggermode,
};
400
401 static uint32_t
402 lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value)
403 {
404
405         value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
406             APIC_LVT_VECTOR);
407         if (lvt->lvt_edgetrigger == 0)
408                 value |= APIC_LVT_TM;
409         if (lvt->lvt_activehi == 0)
410                 value |= APIC_LVT_IIPP_INTALO;
411         if (lvt->lvt_masked)
412                 value |= APIC_LVT_M;
413         value |= lvt->lvt_mode;
414         switch (lvt->lvt_mode) {
415         case APIC_LVT_DM_NMI:
416         case APIC_LVT_DM_SMI:
417         case APIC_LVT_DM_INIT:
418         case APIC_LVT_DM_EXTINT:
419                 if (!lvt->lvt_edgetrigger && bootverbose) {
420                         printf("lapic%u: Forcing LINT%u to edge trigger\n",
421                             la->la_id, pin);
422                         value &= ~APIC_LVT_TM;
423                 }
424                 /* Use a vector of 0. */
425                 break;
426         case APIC_LVT_DM_FIXED:
427                 value |= lvt->lvt_vector;
428                 break;
429         default:
430                 panic("bad APIC LVT delivery mode: %#x\n", value);
431         }
432         return (value);
433 }
434
435 static uint32_t
436 lvt_mode(struct lapic *la, u_int pin, uint32_t value)
437 {
438         struct lvt *lvt;
439
440         KASSERT(pin <= APIC_LVT_MAX,
441             ("%s: pin %u out of range", __func__, pin));
442         if (la->la_lvts[pin].lvt_active)
443                 lvt = &la->la_lvts[pin];
444         else
445                 lvt = &lvts[pin];
446
447         return (lvt_mode_impl(la, lvt, pin, value));
448 }
449
450 static uint32_t
451 elvt_mode(struct lapic *la, u_int idx, uint32_t value)
452 {
453         struct lvt *elvt;
454
455         KASSERT(idx <= APIC_ELVT_MAX,
456             ("%s: idx %u out of range", __func__, idx));
457
458         elvt = &la->la_elvts[idx];
459         KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx));
460         KASSERT(elvt->lvt_edgetrigger,
461             ("%s: ELVT%u is not edge triggered", __func__, idx));
462         KASSERT(elvt->lvt_activehi,
463             ("%s: ELVT%u is not active high", __func__, idx));
464         return (lvt_mode_impl(la, elvt, idx, value));
465 }
466
/*
 * Map the local APIC and setup necessary interrupt vectors.
 * Runs once on the BSP: enables x2APIC if selected, installs the IDT
 * entries for the LAPIC-sourced interrupts, optionally registers the
 * LAPIC timer as an event timer, configures EOI suppression, and (SMP)
 * calibrates the IPI-ack busy-wait loop.
 */
static void
native_lapic_init(vm_paddr_t addr)
{
#ifdef SMP
        uint64_t r, r1, r2, rx;
#endif
        uint32_t ver;
        u_int regs[4];
        int i, arat;

        /*
         * Enable x2APIC mode if possible. Map the local APIC
         * registers page.
         *
         * Keep the LAPIC registers page mapped uncached for x2APIC
         * mode too, to have direct map page attribute set to
         * uncached.  This is needed to work around CPU errata present
         * on all Intel processors.
         */
        KASSERT(trunc_page(addr) == addr,
            ("local APIC not aligned on a page boundary"));
        lapic_paddr = addr;
        lapic_map = pmap_mapdev(addr, PAGE_SIZE);
        if (x2apic_mode) {
                /* Registers are reached via MSRs; drop the MMIO mapping. */
                native_lapic_enable_x2apic();
                lapic_map = NULL;
        }

        /* Setup the spurious interrupt handler. */
        setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
            GSEL_APIC);

        /* Perform basic initialization of the BSP's local APIC. */
        lapic_enable();

        /* Set BSP's per-CPU local APIC ID. */
        PCPU_SET(apic_id, lapic_id());

        /* Local APIC timer interrupt. */
        setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
            SDT_APIC, SEL_KPL, GSEL_APIC);

        /* Local APIC error interrupt. */
        setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
            SDT_APIC, SEL_KPL, GSEL_APIC);

        /* XXX: Thermal interrupt */

        /* Local APIC CMCI. */
        setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
            SDT_APICT, SEL_KPL, GSEL_APIC);

        /* Register the LAPIC timer unless disabled by the "clock" hint. */
        if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
                arat = 0;
                /* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */
                if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) {
                        do_cpuid(0x06, regs);
                        if ((regs[0] & CPUTPM1_ARAT) != 0)
                                arat = 1;
                } else if (cpu_vendor_id == CPU_VENDOR_AMD &&
                    CPUID_TO_FAMILY(cpu_id) >= 0x12) {
                        arat = 1;
                }
                bzero(&lapic_et, sizeof(lapic_et));
                lapic_et.et_name = "LAPIC";
                lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT |
                    ET_FLAGS_PERCPU;
                lapic_et.et_quality = 600;
                if (!arat) {
                        /* Timer may stop in C3; demote the timer quality. */
                        lapic_et.et_flags |= ET_FLAGS_C3STOP;
                        lapic_et.et_quality = 100;
                }
                if ((cpu_feature & CPUID_TSC) != 0 &&
                    (cpu_feature2 & CPUID2_TSCDLT) != 0 &&
                    tsc_is_invariant && tsc_freq != 0) {
                        /* Prefer TSC-deadline mode; tunable can override. */
                        lapic_timer_tsc_deadline = 1;
                        TUNABLE_INT_FETCH("hw.lapic_tsc_deadline",
                            &lapic_timer_tsc_deadline);
                }

                lapic_et.et_frequency = 0;
                /* We don't know frequency yet, so trying to guess. */
                lapic_et.et_min_period = 0x00001000LL;
                lapic_et.et_max_period = SBT_1S;
                lapic_et.et_start = lapic_et_start;
                lapic_et.et_stop = lapic_et_stop;
                lapic_et.et_priv = NULL;
                et_register(&lapic_et);
        }

        /*
         * Set lapic_eoi_suppression after lapic_enable(), to not
         * enable suppression in the hardware prematurely.  Note that
         * we by default enable suppression even when system only has
         * one IO-APIC, since EOI is broadcasted to all APIC agents,
         * including CPUs, otherwise.
         *
         * It seems that at least some KVM versions report
         * EOI_SUPPRESSION bit, but auto-EOI does not work.
         */
        ver = lapic_read32(LAPIC_VERSION);
        if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
                lapic_eoi_suppression = 1;
                if (vm_guest == VM_GUEST_KVM) {
                        if (bootverbose)
                                printf(
                       "KVM -- disabling lapic eoi suppression\n");
                        lapic_eoi_suppression = 0;
                }
                TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
                    &lapic_eoi_suppression);
        }

#ifdef SMP
#define LOOPS   100000
        /*
         * Calibrate the busy loop waiting for IPI ack in xAPIC mode.
         * lapic_ipi_wait_mult contains the number of iterations which
         * approximately delay execution for 1 microsecond (the
         * argument to native_lapic_ipi_wait() is in microseconds).
         *
         * We assume that TSC is present and already measured.
         * Possible TSC frequency jumps are irrelevant to the
         * calibration loop below, the CPU clock management code is
         * not yet started, and we do not enter sleep states.
         */
        KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0,
            ("TSC not initialized"));
        if (!x2apic_mode) {
                r = rdtsc();
                for (rx = 0; rx < LOOPS; rx++) {
                        (void)lapic_read_icr_lo();
                        ia32_pause();
                }
                r = rdtsc() - r;
                r1 = tsc_freq * LOOPS;
                r2 = r * 1000000;
                lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
                if (bootverbose) {
                        printf("LAPIC: ipi_wait() us multiplier %ju (r %ju "
                            "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult,
                            (uintmax_t)r, (uintmax_t)tsc_freq);
                }
        }
#undef LOOPS
#endif /* SMP */
}
617
/*
 * Create a local APIC instance.
 * Populates lapics[apic_id] with default LVT/ELVT state and the fixed
 * I/O interrupt vector reservations, then registers the CPU with SMP.
 */
static void
native_lapic_create(u_int apic_id, int boot_cpu)
{
        int i;

        /* IDs past max_apic_id do not fit in lapics[]; the BSP must fit. */
        if (apic_id > max_apic_id) {
                printf("APIC: Ignoring local APIC with ID %d\n", apic_id);
                if (boot_cpu)
                        panic("Can't ignore BSP");
                return;
        }
        KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u",
            apic_id));

        /*
         * Assume no local LVT overrides and a cluster of 0 and
         * intra-cluster ID of 0.
         */
        lapics[apic_id].la_present = 1;
        lapics[apic_id].la_id = apic_id;
        for (i = 0; i <= APIC_LVT_MAX; i++) {
                lapics[apic_id].la_lvts[i] = lvts[i];
                lapics[apic_id].la_lvts[i].lvt_active = 0;
        }
        for (i = 0; i <= APIC_ELVT_MAX; i++) {
                lapics[apic_id].la_elvts[i] = elvts[i];
                lapics[apic_id].la_elvts[i].lvt_active = 0;
        }
        /* Mark every I/O vector free, then claim the fixed assignments. */
        for (i = 0; i <= APIC_NUM_IOINTS; i++)
            lapics[apic_id].la_ioint_irqs[i] = -1;
        lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
        lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
            IRQ_TIMER;
#ifdef KDTRACE_HOOKS
        lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] =
            IRQ_DTRACE_RET;
#endif
#ifdef XENHVM
        lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN;
#endif


#ifdef SMP
        cpu_add(apic_id, boot_cpu);
#endif
}
667
668 static inline uint32_t
669 amd_read_ext_features(void)
670 {
671         uint32_t version;
672
673         if (cpu_vendor_id != CPU_VENDOR_AMD)
674                 return (0);
675         version = lapic_read32(LAPIC_VERSION);
676         if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
677                 return (lapic_read32(LAPIC_EXT_FEATURES));
678         else
679                 return (0);
680 }
681
682 static inline uint32_t
683 amd_read_elvt_count(void)
684 {
685         uint32_t extf;
686         uint32_t count;
687
688         extf = amd_read_ext_features();
689         count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
690         count = min(count, APIC_ELVT_MAX + 1);
691         return (count);
692 }
693
/*
 * Dump contents of local APIC registers
 */
static void
native_lapic_dump(const char* str)
{
        uint32_t version;
        uint32_t maxlvt;
        uint32_t extf;
        int elvt_count;
        int i;

        version = lapic_read32(LAPIC_VERSION);
        maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
        printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
        /* DFR is not readable in x2APIC mode; print 0 in its place. */
        printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x",
            lapic_read32(LAPIC_ID), version,
            lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR));
        if ((cpu_feature2 & CPUID2_X2APIC) != 0)
                printf(" x2APIC: %d", x2apic_mode);
        printf("\n  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
            lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1),
            lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR));
        printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x",
            lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL),
            lapic_read32(LAPIC_LVT_ERROR));
        /* The PMC and CMCI LVTs are only present when maxlvt is high enough. */
        if (maxlvt >= APIC_LVT_PMC)
                printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT));
        printf("\n");
        if (maxlvt >= APIC_LVT_CMCI)
                printf("   cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI));
        extf = amd_read_ext_features();
        if (extf != 0) {
                printf("   AMD ext features: 0x%08x\n", extf);
                elvt_count = amd_read_elvt_count();
                for (i = 0; i < elvt_count; i++)
                        printf("   AMD elvt%d: 0x%08x\n", i,
                            lapic_read32(LAPIC_EXT_LVT0 + i));
        }
}
734
735 static void
736 native_lapic_xapic_mode(void)
737 {
738         register_t saveintr;
739
740         saveintr = intr_disable();
741         if (x2apic_mode)
742                 native_lapic_enable_x2apic();
743         intr_restore(saveintr);
744 }
745
/*
 * Program this CPU's local APIC: TPR, LINT/PMC/timer/error/CMCI LVT
 * entries and (on AMD) any active extended LVTs.  When "boot" is
 * non-zero a per-CPU timer interrupt counter is also registered.
 * Runs with interrupts disabled.
 */
static void
native_lapic_setup(int boot)
{
        struct lapic *la;
        uint32_t version;
        uint32_t maxlvt;
        register_t saveintr;
        char buf[MAXCOMLEN + 1];
        int elvt_count;
        int i;

        saveintr = intr_disable();

        la = &lapics[lapic_id()];
        KASSERT(la->la_present, ("missing APIC structure"));
        version = lapic_read32(LAPIC_VERSION);
        maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;

        /* Initialize the TPR to allow all interrupts. */
        lapic_set_tpr(0);

        /* Setup spurious vector and enable the local APIC. */
        lapic_enable();

        /* Program LINT[01] LVT entries. */
        lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0,
            lapic_read32(LAPIC_LVT_LINT0)));
        lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1,
            lapic_read32(LAPIC_LVT_LINT1)));

        /* Program the PMC LVT entry if present. */
        if (maxlvt >= APIC_LVT_PMC) {
                /*
                 * NOTE(review): the register index LAPIC_LVT_PCINT is
                 * passed as the current register "value" here, whereas
                 * lapic_update_pmc() passes
                 * lapic_read32(LAPIC_LVT_PCINT).  lvt_mode() clears
                 * every field it manages, so this is probably benign,
                 * but confirm the intent.
                 */
                lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
                    LAPIC_LVT_PCINT));
        }

        /* Program timer LVT and setup handler. */
        la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER,
            lapic_read32(LAPIC_LVT_TIMER));
        la->lvt_timer_last = la->lvt_timer_base;
        lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base);
        if (boot) {
                /* Register a per-CPU interrupt counter for the timer. */
                snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid));
                intrcnt_add(buf, &la->la_timer_count);
        }

        /* Setup the timer if configured. */
        if (la->la_timer_mode != LAT_MODE_UNDEF) {
                KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
                    lapic_id()));
                switch (la->la_timer_mode) {
                case LAT_MODE_PERIODIC:
                        lapic_timer_set_divisor(lapic_timer_divisor);
                        lapic_timer_periodic(la);
                        break;
                case LAT_MODE_ONESHOT:
                        lapic_timer_set_divisor(lapic_timer_divisor);
                        lapic_timer_oneshot(la);
                        break;
                case LAT_MODE_DEADLINE:
                        /* No divisor is programmed in deadline mode. */
                        lapic_timer_deadline(la);
                        break;
                default:
                        panic("corrupted la_timer_mode %p %d", la,
                            la->la_timer_mode);
                }
        }

        /* Program error LVT and clear any existing errors. */
        lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR,
            lapic_read32(LAPIC_LVT_ERROR)));
        lapic_write32(LAPIC_ESR, 0);

        /* XXX: Thermal LVT */

        /* Program the CMCI LVT entry if present. */
        if (maxlvt >= APIC_LVT_CMCI) {
                lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI,
                    lapic_read32(LAPIC_LVT_CMCI)));
        }

        /* Program any active AMD extended LVT entries. */
        elvt_count = amd_read_elvt_count();
        for (i = 0; i < elvt_count; i++) {
                if (la->la_elvts[i].lvt_active)
                        lapic_write32(LAPIC_EXT_LVT0 + i,
                            elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i)));
        }

        intr_restore(saveintr);
}
836
/*
 * Unmask the performance-counter LVT entry, leaving the rest of the
 * previously-programmed entry intact.  No-op when the kernel is built
 * without hwpmc hooks.
 */
static void
native_lapic_reenable_pmc(void)
{
#ifdef HWPMC_HOOKS
	uint32_t value;

	value = lapic_read32(LAPIC_LVT_PCINT);
	value &= ~APIC_LVT_M;		/* clear the LVT mask bit */
	lapic_write32(LAPIC_LVT_PCINT, value);
#endif
}
848
#ifdef HWPMC_HOOKS
/*
 * Reprogram the calling CPU's PMC LVT entry from the cached LVT
 * configuration.  Run on each CPU via smp_rendezvous() by the
 * enable/disable functions below.
 */
static void
lapic_update_pmc(void *dummy)
{
	struct lapic *la;

	la = &lapics[lapic_id()];
	lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
	    lapic_read32(LAPIC_LVT_PCINT)));
}
#endif
860
/*
 * Unmask the performance-counter LVT on every CPU so hwpmc can receive
 * counter-overflow interrupts.  Returns 1 on success, or 0 when the
 * local APIC is absent, when it has no PMC LVT entry, or when hwpmc
 * hooks are compiled out.
 */
static int
native_lapic_enable_pmc(void)
{
#ifdef HWPMC_HOOKS
	u_int32_t maxlvt;

	/* Fail if the local APIC is not present. */
	if (!x2apic_mode && lapic_map == NULL)
		return (0);

	/* Fail if the PMC LVT is not present. */
	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
	if (maxlvt < APIC_LVT_PMC)
		return (0);

	/* Unmask in the global LVT template; CPUs pick it up below. */
	lvts[APIC_LVT_PMC].lvt_masked = 0;

#ifdef EARLY_AP_STARTUP
	MPASS(mp_ncpus == 1 || smp_started);
	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
#else
#ifdef SMP
	/*
	 * If hwpmc was loaded at boot time then the APs may not be
	 * started yet.  In that case, don't forward the request to
	 * them as they will program the lvt when they start.
	 */
	if (smp_started)
		smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
	else
#endif
		lapic_update_pmc(NULL);
#endif
	return (1);
#else
	return (0);
#endif
}
899
/*
 * Mask the performance-counter LVT on every CPU; the counterpart of
 * native_lapic_enable_pmc(), called when hwpmc is unloaded.  Silently
 * returns when the local APIC or the PMC LVT entry is not present.
 */
static void
native_lapic_disable_pmc(void)
{
#ifdef HWPMC_HOOKS
	u_int32_t maxlvt;

	/* Fail if the local APIC is not present. */
	if (!x2apic_mode && lapic_map == NULL)
		return;

	/* Fail if the PMC LVT is not present. */
	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
	if (maxlvt < APIC_LVT_PMC)
		return;

	/* Mask in the global LVT template, then reprogram every CPU. */
	lvts[APIC_LVT_PMC].lvt_masked = 1;

#ifdef SMP
	/* The APs should always be started when hwpmc is unloaded. */
	KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early"));
#endif
	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
#endif
}
924
/*
 * Calibrate the local APIC timer using DELAY(1000000) as a one-second
 * reference.  The divisor is doubled until a one-second masked one-shot
 * run no longer exhausts the 32-bit count; the measured ticks-per-second
 * value is recorded in count_freq.  Panics if even a divisor of 128 is
 * insufficient.
 */
static void
lapic_calibrate_initcount(struct eventtimer *et, struct lapic *la)
{
	u_long value;

	/* Start off with a divisor of 2 (power on reset default). */
	lapic_timer_divisor = 2;
	/* Try to calibrate the local APIC timer. */
	do {
		lapic_timer_set_divisor(lapic_timer_divisor);
		lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
		DELAY(1000000);
		/* Ticks consumed during the one-second delay. */
		value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
		if (value != APIC_TIMER_MAX_COUNT)
			break;
		lapic_timer_divisor <<= 1;
	} while (lapic_timer_divisor <= 128);
	if (lapic_timer_divisor > 128)
		panic("lapic: Divisor too big");
	if (bootverbose) {
		printf("lapic: Divisor %lu, Frequency %lu Hz\n",
		    lapic_timer_divisor, value);
	}
	count_freq = value;
}
950
951 static void
952 lapic_calibrate_deadline(struct eventtimer *et, struct lapic *la __unused)
953 {
954
955         if (bootverbose) {
956                 printf("lapic: deadline tsc mode, Frequency %ju Hz\n",
957                     (uintmax_t)tsc_freq);
958         }
959 }
960
/*
 * Switch the per-CPU timer between periodic, one-shot and TSC-deadline
 * modes, updating the eventtimer frequency and min/max periods to match.
 * No-op when the timer is already in the requested mode.
 */
static void
lapic_change_mode(struct eventtimer *et, struct lapic *la,
    enum lat_timer_mode newmode)
{

	if (la->la_timer_mode == newmode)
		return;
	switch (newmode) {
	case LAT_MODE_PERIODIC:
		lapic_timer_set_divisor(lapic_timer_divisor);
		et->et_frequency = count_freq;
		break;
	case LAT_MODE_DEADLINE:
		/* Deadline mode counts TSC ticks, not APIC timer ticks. */
		et->et_frequency = tsc_freq;
		break;
	case LAT_MODE_ONESHOT:
		lapic_timer_set_divisor(lapic_timer_divisor);
		et->et_frequency = count_freq;
		break;
	default:
		panic("lapic_change_mode %d", newmode);
	}
	la->la_timer_mode = newmode;
	/* Periods are expressed in 32.32 fixed-point seconds. */
	et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
	et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
}
987
/*
 * Eventtimer start method: calibrate the timer on first use, then arm
 * it.  A non-zero 'period' selects periodic mode; otherwise TSC-deadline
 * mode is used when available, falling back to one-shot.  'first' and
 * 'period' are sbintime_t values (32.32 fixed-point seconds).
 */
static int
lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
{
	struct lapic *la;

	la = &lapics[PCPU_GET(apic_id)];
	if (et->et_frequency == 0) {
		lapic_calibrate_initcount(et, la);
		if (lapic_timer_tsc_deadline)
			lapic_calibrate_deadline(et, la);
	}
	if (period != 0) {
		lapic_change_mode(et, la, LAT_MODE_PERIODIC);
		/* Convert 32.32 fixed-point seconds to timer ticks. */
		la->la_timer_period = ((uint32_t)et->et_frequency * period) >>
		    32;
		lapic_timer_periodic(la);
	} else if (lapic_timer_tsc_deadline) {
		lapic_change_mode(et, la, LAT_MODE_DEADLINE);
		la->la_timer_period = (et->et_frequency * first) >> 32;
		lapic_timer_deadline(la);
	} else {
		lapic_change_mode(et, la, LAT_MODE_ONESHOT);
		la->la_timer_period = ((uint32_t)et->et_frequency * first) >>
		    32;
		lapic_timer_oneshot(la);
	}
	return (0);
}
1016
/* Eventtimer stop method: halt the timer and forget the current mode. */
static int
lapic_et_stop(struct eventtimer *et)
{
	struct lapic *la;

	la = &lapics[PCPU_GET(apic_id)];
	lapic_timer_stop(la);
	la->la_timer_mode = LAT_MODE_UNDEF;
	return (0);
}
1027
/* Software-disable the local APIC by clearing the SVR enable bit. */
static void
native_lapic_disable(void)
{
	uint32_t value;

	/* Software disable the local APIC. */
	value = lapic_read32(LAPIC_SVR);
	value &= ~APIC_SVR_SWEN;
	lapic_write32(LAPIC_SVR, value);
}
1038
/*
 * Software-enable the local APIC: clear the vector and focus-checking
 * fields of the SVR, set the enable bits and the spurious vector, and
 * optionally turn on EOI broadcast suppression.
 */
static void
lapic_enable(void)
{
	uint32_t value;

	/* Program the spurious vector to enable the local APIC. */
	value = lapic_read32(LAPIC_SVR);
	value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
	value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT;
	if (lapic_eoi_suppression)
		value |= APIC_SVR_EOI_SUPPRESSION;
	lapic_write32(LAPIC_SVR, value);
}
1052
/* Reset the local APIC on the BSP during resume. */
static void
lapic_resume(struct pic *pic, bool suspend_cancelled)
{

	/* Reprogram all LVT entries and timer state from scratch. */
	lapic_setup(0);
}
1060
1061 static int
1062 native_lapic_id(void)
1063 {
1064         uint32_t v;
1065
1066         KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped"));
1067         v = lapic_read32(LAPIC_ID);
1068         if (!x2apic_mode)
1069                 v >>= APIC_ID_SHIFT;
1070         return (v);
1071 }
1072
1073 static int
1074 native_lapic_intr_pending(u_int vector)
1075 {
1076         uint32_t irr;
1077
1078         /*
1079          * The IRR registers are an array of registers each of which
1080          * only describes 32 interrupts in the low 32 bits.  Thus, we
1081          * divide the vector by 32 to get the register index.
1082          * Finally, we modulus the vector by 32 to determine the
1083          * individual bit to test.
1084          */
1085         irr = lapic_read32(LAPIC_IRR0 + vector / 32);
1086         return (irr & 1 << (vector % 32));
1087 }
1088
/*
 * Record the logical destination (cluster and intra-cluster id) for the
 * given APIC in the software state; bounds are sanity-checked under
 * INVARIANTS only.
 */
static void
native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
{
	struct lapic *la;

	KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist",
	    __func__, apic_id));
	KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big",
	    __func__, cluster));
	KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID,
	    ("%s: intra cluster id %u too big", __func__, cluster_id));
	la = &lapics[apic_id];
	la->la_cluster = cluster;
	la->la_cluster_id = cluster_id;
}
1104
/*
 * Set the mask bit of LVT pin 'pin' for one APIC, or for the global LVT
 * template when apic_id is APIC_ID_ALL.  Only the cached configuration
 * is updated here; the hardware LVT is programmed elsewhere.  Returns
 * EINVAL for an out-of-range pin.
 */
static int
native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
{

	if (pin > APIC_LVT_MAX)
		return (EINVAL);
	if (apic_id == APIC_ID_ALL) {
		lvts[pin].lvt_masked = masked;
		if (bootverbose)
			printf("lapic:");
	} else {
		KASSERT(lapics[apic_id].la_present,
		    ("%s: missing APIC %u", __func__, apic_id));
		lapics[apic_id].la_lvts[pin].lvt_masked = masked;
		lapics[apic_id].la_lvts[pin].lvt_active = 1;
		if (bootverbose)
			printf("lapic%u:", apic_id);
	}
	if (bootverbose)
		printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked");
	return (0);
}
1127
/*
 * Set the delivery mode of LVT pin 'pin' for one APIC, or for the
 * global LVT template when apic_id is APIC_ID_ALL.  Only NMI, SMI,
 * INIT and ExtINT modes are supported; each forces edge trigger and
 * active-high polarity, and ExtINT additionally starts out masked.
 * Returns EINVAL for an out-of-range pin; panics on any other mode.
 */
static int
native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
{
	struct lvt *lvt;

	if (pin > APIC_LVT_MAX)
		return (EINVAL);
	if (apic_id == APIC_ID_ALL) {
		lvt = &lvts[pin];
		if (bootverbose)
			printf("lapic:");
	} else {
		KASSERT(lapics[apic_id].la_present,
		    ("%s: missing APIC %u", __func__, apic_id));
		lvt = &lapics[apic_id].la_lvts[pin];
		lvt->lvt_active = 1;
		if (bootverbose)
			printf("lapic%u:", apic_id);
	}
	lvt->lvt_mode = mode;
	switch (mode) {
	case APIC_LVT_DM_NMI:
	case APIC_LVT_DM_SMI:
	case APIC_LVT_DM_INIT:
	case APIC_LVT_DM_EXTINT:
		lvt->lvt_edgetrigger = 1;
		lvt->lvt_activehi = 1;
		/* ExtINT entries start out masked. */
		if (mode == APIC_LVT_DM_EXTINT)
			lvt->lvt_masked = 1;
		else
			lvt->lvt_masked = 0;
		break;
	default:
		panic("Unsupported delivery mode: 0x%x\n", mode);
	}
	if (bootverbose) {
		printf(" Routing ");
		switch (mode) {
		case APIC_LVT_DM_NMI:
			printf("NMI");
			break;
		case APIC_LVT_DM_SMI:
			printf("SMI");
			break;
		case APIC_LVT_DM_INIT:
			printf("INIT");
			break;
		case APIC_LVT_DM_EXTINT:
			printf("ExtINT");
			break;
		}
		printf(" -> LINT%u\n", pin);
	}
	return (0);
}
1183
/*
 * Set the polarity of LVT pin 'pin' for one APIC, or for the global LVT
 * template when apic_id is APIC_ID_ALL.  Returns EINVAL for an
 * out-of-range pin or for INTR_POLARITY_CONFORM.
 */
static int
native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
{

	if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM)
		return (EINVAL);
	if (apic_id == APIC_ID_ALL) {
		lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH);
		if (bootverbose)
			printf("lapic:");
	} else {
		KASSERT(lapics[apic_id].la_present,
		    ("%s: missing APIC %u", __func__, apic_id));
		lapics[apic_id].la_lvts[pin].lvt_active = 1;
		lapics[apic_id].la_lvts[pin].lvt_activehi =
		    (pol == INTR_POLARITY_HIGH);
		if (bootverbose)
			printf("lapic%u:", apic_id);
	}
	if (bootverbose)
		printf(" LINT%u polarity: %s\n", pin,
		    pol == INTR_POLARITY_HIGH ? "high" : "low");
	return (0);
}
1208
/*
 * Set the trigger mode of LVT pin 'pin' for one APIC, or for the global
 * LVT template when apic_id is APIC_ID_ALL.  Returns EINVAL for an
 * out-of-range pin or for INTR_TRIGGER_CONFORM.
 */
static int
native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin,
     enum intr_trigger trigger)
{

	if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
		return (EINVAL);
	if (apic_id == APIC_ID_ALL) {
		lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE);
		if (bootverbose)
			printf("lapic:");
	} else {
		KASSERT(lapics[apic_id].la_present,
		    ("%s: missing APIC %u", __func__, apic_id));
		lapics[apic_id].la_lvts[pin].lvt_edgetrigger =
		    (trigger == INTR_TRIGGER_EDGE);
		lapics[apic_id].la_lvts[pin].lvt_active = 1;
		if (bootverbose)
			printf("lapic%u:", apic_id);
	}
	if (bootverbose)
		printf(" LINT%u trigger: %s\n", pin,
		    trigger == INTR_TRIGGER_EDGE ? "edge" : "level");
	return (0);
}
1234
/*
 * Adjust the TPR of the current CPU so that it blocks all interrupts below
 * the passed in vector.
 */
static void
lapic_set_tpr(u_int vector)
{
#ifdef CHEAP_TPR
	/* With CHEAP_TPR a plain write suffices; no read-modify-write. */
	lapic_write32(LAPIC_TPR, vector);
#else
	uint32_t tpr;

	/* Preserve everything outside the priority field. */
	tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO;
	tpr |= vector;
	lapic_write32(LAPIC_TPR, tpr);
#endif
}
1252
/* Ack the in-service interrupt by writing the EOI register. */
static void
native_lapic_eoi(void)
{

	lapic_write32_nofence(LAPIC_EOI, 0);
}
1259
/*
 * Interrupt entry point for I/O APIC vectors: translate the IDT vector
 * back to an interrupt source and run its handlers.
 */
void
lapic_handle_intr(int vector, struct trapframe *frame)
{
	struct intsrc *isrc;

	isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id),
	    vector));
	intr_execute_handlers(isrc, frame);
}
1269
/*
 * Interrupt entry point for the local APIC timer: ack the interrupt,
 * bump this CPU's tick counter and, when the eventtimer is active,
 * invoke its callback with the interrupt frame temporarily installed
 * as the thread's td_intr_frame.
 */
void
lapic_handle_timer(struct trapframe *frame)
{
	struct lapic *la;
	struct trapframe *oldframe;
	struct thread *td;

	/* Send EOI first thing. */
	lapic_eoi();

#if defined(SMP) && !defined(SCHED_ULE)
	/*
	 * Don't do any accounting for the disabled HTT cores, since it
	 * will provide misleading numbers for the userland.
	 *
	 * No locking is necessary here, since even if we lose the race
	 * when hlt_cpus_mask changes it is not a big deal, really.
	 *
	 * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask
	 * and unlike other schedulers it actually schedules threads to
	 * those CPUs.
	 */
	if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask))
		return;
#endif

	/* Look up our local APIC structure for the tick counters. */
	la = &lapics[PCPU_GET(apic_id)];
	/* Counter registered with intrcnt_add() during lapic_setup(). */
	(*la->la_timer_count)++;
	critical_enter();
	if (lapic_et.et_active) {
		td = curthread;
		td->td_intr_nesting_level++;
		oldframe = td->td_intr_frame;
		td->td_intr_frame = frame;
		lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg);
		td->td_intr_frame = oldframe;
		td->td_intr_nesting_level--;
	}
	critical_exit();
}
1311
/*
 * Program the timer divide configuration register.  The divisor must be
 * a power of two covered by the lapic_timer_divisors[] table.
 */
static void
lapic_timer_set_divisor(u_int divisor)
{

	KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
	KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors),
		("lapic: invalid divisor %u", divisor));
	lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]);
}
1321
1322 static void
1323 lapic_timer_oneshot(struct lapic *la)
1324 {
1325         uint32_t value;
1326
1327         value = la->lvt_timer_base;
1328         value &= ~(APIC_LVTT_TM | APIC_LVT_M);
1329         value |= APIC_LVTT_TM_ONE_SHOT;
1330         la->lvt_timer_last = value;
1331         lapic_write32(LAPIC_LVT_TIMER, value);
1332         lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
1333 }
1334
1335 static void
1336 lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count)
1337 {
1338         uint32_t value;
1339
1340         value = la->lvt_timer_base;
1341         value &= ~APIC_LVTT_TM;
1342         value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M;
1343         la->lvt_timer_last = value;
1344         lapic_write32(LAPIC_LVT_TIMER, value);
1345         lapic_write32(LAPIC_ICR_TIMER, count);
1346 }
1347
1348 static void
1349 lapic_timer_periodic(struct lapic *la)
1350 {
1351         uint32_t value;
1352
1353         value = la->lvt_timer_base;
1354         value &= ~(APIC_LVTT_TM | APIC_LVT_M);
1355         value |= APIC_LVTT_TM_PERIODIC;
1356         la->lvt_timer_last = value;
1357         lapic_write32(LAPIC_LVT_TIMER, value);
1358         lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
1359 }
1360
/*
 * Arm the timer in TSC-deadline mode to fire la_timer_period TSC ticks
 * from now.  The LVT is rewritten only when its value changed since the
 * last write.
 */
static void
lapic_timer_deadline(struct lapic *la)
{
	uint32_t value;

	value = la->lvt_timer_base;
	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
	value |= APIC_LVTT_TM_TSCDLT;
	if (value != la->lvt_timer_last) {
		la->lvt_timer_last = value;
		lapic_write32_nofence(LAPIC_LVT_TIMER, value);
		/* Order the MMIO LVT write ahead of the WRMSR below. */
		if (!x2apic_mode)
			mfence();
	}
	wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc());
}
1377
/*
 * Stop the timer: clear the deadline MSR in TSC-deadline mode,
 * otherwise mask the timer LVT entry.
 */
static void
lapic_timer_stop(struct lapic *la)
{
	uint32_t value;

	if (la->la_timer_mode == LAT_MODE_DEADLINE) {
		wrmsr(MSR_TSC_DEADLINE, 0);
		mfence();
	} else {
		value = la->lvt_timer_base;
		value &= ~APIC_LVTT_TM;
		value |= APIC_LVT_M;
		la->lvt_timer_last = value;
		lapic_write32(LAPIC_LVT_TIMER, value);
	}
}
1394
/* CMCI interrupt entry point: ack the interrupt and run the MCA handler. */
void
lapic_handle_cmc(void)
{

	lapic_eoi();
	cmc_intr();
}
1402
/*
 * Called from the mca_init() to activate the CMC interrupt if this CPU is
 * responsible for monitoring any MC banks for CMC events.  Since mca_init()
 * is called prior to lapic_setup() during boot, this just needs to unmask
 * this CPU's LVT_CMCI entry.
 */
static void
native_lapic_enable_cmc(void)
{
	u_int apic_id;

#ifdef DEV_ATPIC
	/* NOTE(review): bails quietly if the local APIC is unmapped here. */
	if (!x2apic_mode && lapic_map == NULL)
		return;
#endif
	apic_id = PCPU_GET(apic_id);
	KASSERT(lapics[apic_id].la_present,
	    ("%s: missing APIC %u", __func__, apic_id));
	lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0;
	lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1;
	if (bootverbose)
		printf("lapic%u: CMCI unmasked\n", apic_id);
}
1426
/*
 * Activate the AMD MCE-thresholding extended LVT for the calling CPU.
 * Returns the ELVT index (APIC_ELVT_MCA) on success, or -1 when the
 * local APIC or the required extended LVT is unavailable.  An entry
 * already unmasked (e.g. by firmware) is left untouched.
 */
static int
native_lapic_enable_mca_elvt(void)
{
	u_int apic_id;
	uint32_t value;
	int elvt_count;

#ifdef DEV_ATPIC
	if (lapic_map == NULL)
		return (-1);
#endif

	apic_id = PCPU_GET(apic_id);
	KASSERT(lapics[apic_id].la_present,
	    ("%s: missing APIC %u", __func__, apic_id));
	elvt_count = amd_read_elvt_count();
	if (elvt_count <= APIC_ELVT_MCA)
		return (-1);

	value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA);
	if ((value & APIC_LVT_M) == 0) {
		/* Already unmasked; don't reprogram it. */
		if (bootverbose)
			printf("AMD MCE Thresholding Extended LVT is already active\n");
		return (APIC_ELVT_MCA);
	}
	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0;
	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1;
	if (bootverbose)
		printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id);
	return (APIC_ELVT_MCA);
}
1458
/*
 * Error-interrupt entry point: latch and report the APIC error status,
 * then ack the interrupt.
 */
void
lapic_handle_error(void)
{
	uint32_t esr;

	/*
	 * Read the contents of the error status register.  Write to
	 * the register first before reading from it to force the APIC
	 * to update its value to indicate any errors that have
	 * occurred since the previous write to the register.
	 */
	lapic_write32(LAPIC_ESR, 0);
	esr = lapic_read32(LAPIC_ESR);

	printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
	lapic_eoi();
}
1476
1477 static u_int
1478 native_apic_cpuid(u_int apic_id)
1479 {
1480 #ifdef SMP
1481         return apic_cpuids[apic_id];
1482 #else
1483         return 0;
1484 #endif
1485 }
1486
/*
 * Request a free IDT vector to be used by the specified IRQ.  Returns
 * the allocated vector, or 0 when every I/O interrupt vector is in use.
 */
static u_int
native_apic_alloc_vector(u_int apic_id, u_int irq)
{
	u_int vector;

	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));

	/*
	 * Search for a free vector.  Currently we just use a very simple
	 * algorithm to find the first free vector.
	 */
	mtx_lock_spin(&icu_lock);
	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
		if (lapics[apic_id].la_ioint_irqs[vector] != -1)
			continue;
		lapics[apic_id].la_ioint_irqs[vector] = irq;
		mtx_unlock_spin(&icu_lock);
		return (vector + APIC_IO_INTS);
	}
	mtx_unlock_spin(&icu_lock);
	/* All vectors in use; 0 signals failure to the caller. */
	return (0);
}
1510
/*
 * Request 'count' free contiguous IDT vectors to be used by 'count'
 * IRQs.  'count' must be a power of two and the vectors will be
 * aligned on a boundary of 'align'.  If the request cannot be
 * satisfied, 0 is returned.
 */
static u_int
native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
{
	u_int first, run, vector;

	KASSERT(powerof2(count), ("bad count"));
	KASSERT(powerof2(align), ("bad align"));
	KASSERT(align >= count, ("align < count"));
#ifdef INVARIANTS
	for (run = 0; run < count; run++)
		KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
		    irqs[run], run));
#endif

	/*
	 * Search for 'count' free vectors.  As with apic_alloc_vector(),
	 * this just uses a simple first fit algorithm.
	 * 'run' counts the current streak of free vectors starting at the
	 * aligned vector 'first'.
	 */
	run = 0;
	first = 0;
	mtx_lock_spin(&icu_lock);
	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {

		/* Vector is in use, end run. */
		if (lapics[apic_id].la_ioint_irqs[vector] != -1) {
			run = 0;
			first = 0;
			continue;
		}

		/* Start a new run if run == 0 and vector is aligned. */
		if (run == 0) {
			if ((vector & (align - 1)) != 0)
				continue;
			first = vector;
		}
		run++;

		/* Keep looping if the run isn't long enough yet. */
		if (run < count)
			continue;

		/* Found a run, assign IRQs and return the first vector. */
		for (vector = 0; vector < count; vector++)
			lapics[apic_id].la_ioint_irqs[first + vector] =
			    irqs[vector];
		mtx_unlock_spin(&icu_lock);
		return (first + APIC_IO_INTS);
	}
	mtx_unlock_spin(&icu_lock);
	printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count);
	return (0);
}
1570
/*
 * Enable a vector for a particular apic_id.  Since all lapics share idt
 * entries and ioint_handlers this enables the vector on all lapics.  lapics
 * which do not have the vector configured would report spurious interrupts
 * should it fire.
 */
static void
native_apic_enable_vector(u_int apic_id, u_int vector)
{

	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
	KASSERT(ioint_handlers[vector / 32] != NULL,
	    ("No ISR handler for vector %u", vector));
#ifdef KDTRACE_HOOKS
	KASSERT(vector != IDT_DTRACE_RET,
	    ("Attempt to overwrite DTrace entry"));
#endif
	/* Use the PTI trampoline variants when page-table isolation is on. */
	setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
	    SDT_APIC, SEL_KPL, GSEL_APIC);
}
1591
/*
 * Counterpart of native_apic_enable_vector().  The IDT entry is
 * intentionally left in place (see the comment below), so today this
 * only sanity-checks its arguments.
 */
static void
native_apic_disable_vector(u_int apic_id, u_int vector)
{

	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
#ifdef KDTRACE_HOOKS
	KASSERT(vector != IDT_DTRACE_RET,
	    ("Attempt to overwrite DTrace entry"));
#endif
	KASSERT(ioint_handlers[vector / 32] != NULL,
	    ("No ISR handler for vector %u", vector));
#ifdef notyet
	/*
	 * We can not currently clear the idt entry because other cpus
	 * may have a valid vector at this offset.
	 */
	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
	    SEL_KPL, GSEL_APIC);
#endif
}
1612
/* Release an APIC vector when it's no longer in use. */
static void
native_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
{
	struct thread *td;

	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
	    ("Vector %u does not map to an IRQ line", vector));
	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
	KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
	    irq, ("IRQ mismatch"));
#ifdef KDTRACE_HOOKS
	KASSERT(vector != IDT_DTRACE_RET,
	    ("Attempt to overwrite DTrace entry"));
#endif

	/*
	 * Bind us to the cpu that owned the vector before freeing it so
	 * we don't lose an interrupt delivery race.
	 */
	td = curthread;
	if (!rebooting) {
		thread_lock(td);
		if (sched_is_bound(td))
			panic("apic_free_vector: Thread already bound.\n");
		sched_bind(td, apic_cpuid(apic_id));
		thread_unlock(td);
	}
	mtx_lock_spin(&icu_lock);
	/* Mark the slot free; -1 means unassigned. */
	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1;
	mtx_unlock_spin(&icu_lock);
	if (!rebooting) {
		thread_lock(td);
		sched_unbind(td);
		thread_unlock(td);
	}
}
1651
/* Map an IDT vector (APIC) to an IRQ (interrupt source). */
static u_int
apic_idt_to_irq(u_int apic_id, u_int vector)
{
	int irq;

	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
	    ("Vector %u does not map to an IRQ line", vector));
#ifdef KDTRACE_HOOKS
	KASSERT(vector != IDT_DTRACE_RET,
	    ("Attempt to overwrite DTrace entry"));
#endif
	irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS];
	/* Unassigned slots are -1; report those as IRQ 0. */
	if (irq < 0)
		irq = 0;
	return (irq);
}
1670
#ifdef DDB
/*
 * Dump data about APIC IDT vector mappings.
 */
DB_SHOW_COMMAND(apic, db_show_apic)
{
	struct intsrc *isrc;
	int i, verbose;
	u_int apic_id;
	u_int irq;

	/* Modifier "v" selects verbose output, "vv" very verbose. */
	if (strcmp(modif, "vv") == 0)
		verbose = 2;
	else if (strcmp(modif, "v") == 0)
		verbose = 1;
	else
		verbose = 0;
	for (apic_id = 0; apic_id <= max_apic_id; apic_id++) {
		if (lapics[apic_id].la_present == 0)
			continue;
		db_printf("Interrupts bound to lapic %u\n", apic_id);
		for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
			irq = lapics[apic_id].la_ioint_irqs[i];
			/* Skip unassigned slots and special vectors. */
			if (irq == -1 || irq == IRQ_SYSCALL)
				continue;
#ifdef KDTRACE_HOOKS
			if (irq == IRQ_DTRACE_RET)
				continue;
#endif
#ifdef XENHVM
			if (irq == IRQ_EVTCHN)
				continue;
#endif
			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
			if (irq == IRQ_TIMER)
				db_printf("lapic timer\n");
			else if (irq < NUM_IO_INTS) {
				isrc = intr_lookup_source(irq);
				if (isrc == NULL || verbose == 0)
					db_printf("IRQ %u\n", irq);
				else
					db_dump_intr_event(isrc->is_event,
					    verbose == 2);
			} else
				db_printf("IRQ %u ???\n", irq);
		}
	}
}
1719
/*
 * Print the indices (offset by 'base') of the bits set in 'v',
 * prefixed with 'prefix'; print nothing at all when 'v' is zero.
 */
static void
dump_mask(const char *prefix, uint32_t v, int base)
{
	int bit, printed;

	printed = 0;
	for (bit = 0; bit < 32; bit++) {
		if ((v & (1 << bit)) == 0)
			continue;
		if (!printed) {
			db_printf("%s:", prefix);
			printed = 1;
		}
		db_printf(" %02x", base + bit);
	}
	if (printed)
		db_printf("\n");
}
1737
/* Show info from the lapic regs for this CPU. */
DB_SHOW_COMMAND(lapic, db_show_lapic)
{
	uint32_t v;

	db_printf("lapic ID = %d\n", lapic_id());
	v = lapic_read32(LAPIC_VERSION);
	/* Version register: version number plus the max LVT entry index. */
	db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
	    v & 0xf);
	db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
	v = lapic_read32(LAPIC_SVR);
	/* Spurious vector register also carries the APIC enable bit. */
	db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
	db_printf("TPR      = %02x\n", lapic_read32(LAPIC_TPR));

/*
 * Dump one 32-vector slice of a 256-bit register bank (ISR/TMR/IRR)
 * through dump_mask(), labelling set bits with their vector numbers.
 */
#define dump_field(prefix, regn, index)                                 \
	dump_mask(__XSTRING(prefix ## index),                           \
	    lapic_read32(LAPIC_ ## regn ## index),                      \
	    index * 32)

	db_printf("In-service Interrupts:\n");
	dump_field(isr, ISR, 0);
	dump_field(isr, ISR, 1);
	dump_field(isr, ISR, 2);
	dump_field(isr, ISR, 3);
	dump_field(isr, ISR, 4);
	dump_field(isr, ISR, 5);
	dump_field(isr, ISR, 6);
	dump_field(isr, ISR, 7);

	db_printf("TMR Interrupts:\n");
	dump_field(tmr, TMR, 0);
	dump_field(tmr, TMR, 1);
	dump_field(tmr, TMR, 2);
	dump_field(tmr, TMR, 3);
	dump_field(tmr, TMR, 4);
	dump_field(tmr, TMR, 5);
	dump_field(tmr, TMR, 6);
	dump_field(tmr, TMR, 7);

	db_printf("IRR Interrupts:\n");
	dump_field(irr, IRR, 0);
	dump_field(irr, IRR, 1);
	dump_field(irr, IRR, 2);
	dump_field(irr, IRR, 3);
	dump_field(irr, IRR, 4);
	dump_field(irr, IRR, 5);
	dump_field(irr, IRR, 6);
	dump_field(irr, IRR, 7);

#undef dump_field
}
1790 #endif
1791
1792 /*
1793  * APIC probing support code.  This includes code to manage enumerators.
1794  */
1795
/* List of registered enumerators and the one selected by apic_init(). */
static SLIST_HEAD(, apic_enumerator) enumerators =
	SLIST_HEAD_INITIALIZER(enumerators);
static struct apic_enumerator *best_enum;
1799
/*
 * Register an APIC enumerator so apic_init() can probe it later.
 * With INVARIANTS, panic on a duplicate registration of the same
 * enumerator.
 */
void
apic_register_enumerator(struct apic_enumerator *enumerator)
{
#ifdef INVARIANTS
	struct apic_enumerator *apic_enum;

	SLIST_FOREACH(apic_enum, &enumerators, apic_next) {
		if (apic_enum == enumerator)
			panic("%s: Duplicate register of %s", __func__,
			    enumerator->apic_name);
	}
#endif
	SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next);
}
1814
/*
 * We have to look for CPU's very, very early because certain subsystems
 * want to know how many CPU's we have extremely early on in the boot
 * process.
 */
static void
apic_init(void *dummy __unused)
{
	struct apic_enumerator *enumerator;
	int retval, best;

	/* We only support built in local APICs. */
	if (!(cpu_feature & CPUID_APIC))
		return;

	/* Don't probe if APIC mode is disabled. */
	if (resource_disabled("apic", 0))
		return;

	/* Probe all the enumerators to find the best match. */
	best_enum = NULL;
	best = 0;
	SLIST_FOREACH(enumerator, &enumerators, apic_next) {
		retval = enumerator->apic_probe();
		/* A positive return means "not usable"; skip it. */
		if (retval > 0)
			continue;
		/*
		 * Non-positive returns are probe priorities; the
		 * enumerator returning the greatest (closest to zero)
		 * value wins.
		 */
		if (best_enum == NULL || best < retval) {
			best_enum = enumerator;
			best = retval;
		}
	}
	if (best_enum == NULL) {
		if (bootverbose)
			printf("APIC: Could not find any APICs.\n");
#ifndef DEV_ATPIC
		/* Without the legacy ATPIC there is no fallback PIC. */
		panic("running without device atpic requires a local APIC");
#endif
		return;
	}

	if (bootverbose)
		printf("APIC: Using the %s enumerator.\n",
		    best_enum->apic_name);

#ifdef I686_CPU
	/*
	 * To work around an errata, we disable the local APIC on some
	 * CPUs during early startup.  We need to turn the local APIC back
	 * on on such CPUs now.
	 */
	ppro_reenable_apic();
#endif

	/* Probe the CPU's in the system. */
	retval = best_enum->apic_probe_cpus();
	if (retval != 0)
		printf("%s: Failed to probe CPUs: returned %d\n",
		    best_enum->apic_name, retval);

}
SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL);
1876
/*
 * Setup the local APIC.  We have to do this prior to starting up the APs
 * in the SMP case.
 */
static void
apic_setup_local(void *dummy __unused)
{
	int retval;

	/* Nothing to do if apic_init() found no usable enumerator. */
	if (best_enum == NULL)
		return;

	/* Per-lapic state table, indexed by APIC ID. */
	lapics = malloc(sizeof(*lapics) * (max_apic_id + 1), M_LAPIC,
	    M_WAITOK | M_ZERO);

	/* Initialize the local APIC. */
	retval = best_enum->apic_setup_local();
	if (retval != 0)
		printf("%s: Failed to setup the local APIC: returned %d\n",
		    best_enum->apic_name, retval);
}
SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL);
1899
/*
 * Setup the I/O APICs.
 */
static void
apic_setup_io(void *dummy __unused)
{
	int retval;

	/* Nothing to do if apic_init() found no usable enumerator. */
	if (best_enum == NULL)
		return;

	/*
	 * Local APIC must be registered before other PICs and pseudo PICs
	 * for proper suspend/resume order.
	 */
	intr_register_pic(&lapic_pic);

	retval = best_enum->apic_setup_io();
	if (retval != 0)
		printf("%s: Failed to setup I/O APICs: returned %d\n",
		    best_enum->apic_name, retval);

	/*
	 * Finish setting up the local APIC on the BSP once we know
	 * how to properly program the LINT pins.  In particular, this
	 * enables the EOI suppression mode, if the LAPIC supports it
	 * and the user did not disable it.
	 */
	lapic_setup(1);
	if (bootverbose)
		lapic_dump("BSP");

	/* Enable the MSI "pic". */
	init_ops.msi_init();
}
SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);
1936
1937 #ifdef SMP
1938 /*
1939  * Inter Processor Interrupt functions.  The lapic_ipi_*() functions are
1940  * private to the MD code.  The public interface for the rest of the
1941  * kernel is defined in mp_machdep.c.
1942  */
1943
/*
 * Wait delay microseconds for IPI to be sent.  If delay is -1, we
 * wait forever.  Returns 1 once the ICR delivery-status bit reads
 * idle, 0 if the IPI is still pending when the timeout expires.
 */
static int
native_lapic_ipi_wait(int delay)
{
	uint64_t rx;

	/* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
	if (x2apic_mode)
		return (1);

	/*
	 * Poll the delivery status, pausing between reads;
	 * lapic_ipi_wait_mult scales loop iterations to microseconds.
	 */
	for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
		if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
		    APIC_DELSTAT_IDLE)
			return (1);
		ia32_pause();
	}
	return (0);
}
1965
/*
 * Low-level IPI dispatch: program the Interrupt Command Register with
 * the given low word and destination, then send.  'dest' is only used
 * when icrlo selects the explicit-destination mode (APIC_DEST_DESTFLD).
 */
static void
native_lapic_ipi_raw(register_t icrlo, u_int dest)
{
	uint64_t icr;
	uint32_t vhi, vlo;
	register_t saveintr;

	/* XXX: Need more sanity checking of icrlo? */
	KASSERT(x2apic_mode || lapic_map != NULL,
	    ("%s called too early", __func__));
	KASSERT(x2apic_mode ||
	    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
	    ("%s: invalid dest field", __func__));
	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
	    ("%s: reserved bits set in ICR LO register", __func__));

	/* Set destination in ICR HI register if it is being used. */
	if (!x2apic_mode) {
		/*
		 * xAPIC mode does a read-modify-write of the ICR;
		 * disable interrupts so the pair cannot be torn by an
		 * IPI sent from an interrupt handler on this CPU.
		 */
		saveintr = intr_disable();
		icr = lapic_read_icr();
	}

	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
		if (x2apic_mode) {
			vhi = dest;
		} else {
			/* Replace only the APIC ID field in ICR HI. */
			vhi = icr >> 32;
			vhi &= ~APIC_ID_MASK;
			vhi |= dest << APIC_ID_SHIFT;
		}
	} else {
		vhi = 0;
	}

	/* Program the contents of the IPI and dispatch it. */
	if (x2apic_mode) {
		vlo = icrlo;
	} else {
		/* Preserve the reserved bits read back from the ICR. */
		vlo = icr;
		vlo &= APIC_ICRLO_RESV_MASK;
		vlo |= icrlo;
	}
	lapic_write_icr(vhi, vlo);
	if (!x2apic_mode)
		intr_restore(saveintr);
}
2012
/* Spin budgets (in lapic_ipi_wait() units) before/after sending an IPI. */
#define BEFORE_SPIN     50000
#ifdef DETECT_DEADLOCK
#define AFTER_SPIN      50
#endif

/*
 * Send the given interrupt vector as an IPI to 'dest', which is either
 * one of the APIC_IPI_DEST_* shorthands or an explicit APIC ID.
 * Vectors at or above IPI_NMI_FIRST are delivered as NMIs instead of
 * fixed interrupts.
 */
static void
native_lapic_ipi_vectored(u_int vector, int dest)
{
	register_t icrlo, destfield;

	KASSERT((vector & ~APIC_VECTOR_MASK) == 0,
	    ("%s: invalid vector %d", __func__, vector));

	icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;

	/*
	 * NMI IPIs are just fake vectors used to send a NMI.  Use special rules
	 * regarding NMIs if passed, otherwise specify the vector.
	 */
	if (vector >= IPI_NMI_FIRST)
		icrlo |= APIC_DELMODE_NMI;
	else
		icrlo |= vector | APIC_DELMODE_FIXED;
	destfield = 0;
	/* Translate the destination into ICR shorthand or an explicit ID. */
	switch (dest) {
	case APIC_IPI_DEST_SELF:
		icrlo |= APIC_DEST_SELF;
		break;
	case APIC_IPI_DEST_ALL:
		icrlo |= APIC_DEST_ALLISELF;
		break;
	case APIC_IPI_DEST_OTHERS:
		icrlo |= APIC_DEST_ALLESELF;
		break;
	default:
		KASSERT(x2apic_mode ||
		    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
		    ("%s: invalid destination 0x%x", __func__, dest));
		destfield = dest;
	}

	/* Wait for an earlier IPI to finish. */
	if (!lapic_ipi_wait(BEFORE_SPIN)) {
		/* Do not panic on top of an in-progress panic. */
		if (panicstr != NULL)
			return;
		else
			panic("APIC: Previous IPI is stuck");
	}

	lapic_ipi_raw(icrlo, destfield);

#ifdef DETECT_DEADLOCK
	/* Wait for IPI to be delivered. */
	if (!lapic_ipi_wait(AFTER_SPIN)) {
#ifdef needsattention
		/*
		 * XXX FIXME:
		 *
		 * The above function waits for the message to actually be
		 * delivered.  It breaks out after an arbitrary timeout
		 * since the message should eventually be delivered (at
		 * least in theory) and that if it wasn't we would catch
		 * the failure with the check above when the next IPI is
		 * sent.
		 *
		 * We could skip this wait entirely, EXCEPT it probably
		 * protects us from other routines that assume that the
		 * message was delivered and acted upon when this function
		 * returns.
		 */
		printf("APIC: IPI might be stuck\n");
#else /* !needsattention */
		/* Wait until message is sent without a timeout. */
		while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
			ia32_pause();
#endif /* needsattention */
	}
#endif /* DETECT_DEADLOCK */
}
2092
2093 #endif /* SMP */
2094
/*
 * Since the IDT is shared by all CPUs the IPI slot update needs to be globally
 * visible.
 *
 * Consider the case where an IPI is generated immediately after allocation:
 *     vector = lapic_ipi_alloc(ipifunc);
 *     ipi_selected(other_cpus, vector);
 *
 * In xAPIC mode a write to ICR_LO has serializing semantics because the
 * APIC page is mapped as an uncached region. In x2APIC mode there is an
 * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
 * the IDT slot update is globally visible before the IPI is delivered.
 */
/*
 * Allocate a dynamic IPI vector: find a free slot in the IDT dynamic
 * range and install 'ipifunc' there.  Returns the vector number, or -1
 * if no slot is free.
 */
static int
native_lapic_ipi_alloc(inthand_t *ipifunc)
{
	struct gate_descriptor *ip;
	long func;
	int idx, vector;

	/* The reserved stubs mark free slots; they are not valid handlers. */
	KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
	    ("invalid ipifunc %p", ipifunc));

	vector = -1;
	mtx_lock_spin(&icu_lock);
	for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
		ip = &idt[idx];
		/* Reassemble the handler address from the gate descriptor. */
		func = (ip->gd_hioffset << 16) | ip->gd_looffset;
		/* A slot is free if it still points at the rsvd stub. */
		if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
		    (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
			vector = idx;
			setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
			break;
		}
	}
	mtx_unlock_spin(&icu_lock);
	return (vector);
}
2133
/*
 * Release a dynamic IPI vector allocated by native_lapic_ipi_alloc():
 * point the IDT slot back at the appropriate reserved stub.
 */
static void
native_lapic_ipi_free(int vector)
{
	struct gate_descriptor *ip;
	long func;

	KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
	    ("%s: invalid vector %d", __func__, vector));

	mtx_lock_spin(&icu_lock);
	ip = &idt[vector];
	/* Reassemble the handler address from the gate descriptor. */
	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
	/* Freeing an already-free slot indicates a double free. */
	KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
	    func != (uintptr_t)&IDTVEC(rsvd_pti),
	    ("invalid idtfunc %#lx", func));
	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
	    SEL_KPL, GSEL_APIC);
	mtx_unlock_spin(&icu_lock);
}