]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/x86/x86/local_apic.c
MFC 338360,338415,338624,338630,338631,338725: Dynamic x86 IRQ layout.
[FreeBSD/FreeBSD.git] / sys / x86 / x86 / local_apic.c
1 /*-
2  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
3  * Copyright (c) 1996, by Steve Passe
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. The name of the developer may NOT be used to endorse or promote products
12  *    derived from this software without specific prior written permission.
13  * 3. Neither the name of the author nor the names of any co-contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29
30 /*
31  * Local APIC support on Pentium and later processors.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_atpic.h"
38 #include "opt_hwpmc_hooks.h"
39
40 #include "opt_ddb.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/bus.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/pcpu.h>
49 #include <sys/proc.h>
50 #include <sys/sched.h>
51 #include <sys/smp.h>
52 #include <sys/sysctl.h>
53 #include <sys/timeet.h>
54
55 #include <vm/vm.h>
56 #include <vm/pmap.h>
57
58 #include <x86/apicreg.h>
59 #include <machine/clock.h>
60 #include <machine/cpufunc.h>
61 #include <machine/cputypes.h>
62 #include <machine/frame.h>
63 #include <machine/intr_machdep.h>
64 #include <x86/apicvar.h>
65 #include <x86/mca.h>
66 #include <machine/md_var.h>
67 #include <machine/smp.h>
68 #include <machine/specialreg.h>
69 #include <x86/init.h>
70
71 #ifdef DDB
72 #include <sys/interrupt.h>
73 #include <ddb/ddb.h>
74 #endif
75
/*
 * IDT gate type and selector used when installing APIC vectors via
 * setidt(): amd64 uses 64-bit interrupt gates with a zero selector
 * argument, i386 uses 386 interrupt/trap gates with the kernel code
 * selector.
 */
#ifdef __amd64__
#define SDT_APIC        SDT_SYSIGT
#define SDT_APICT       SDT_SYSIGT
#define GSEL_APIC       0
#else
#define SDT_APIC        SDT_SYS386IGT
#define SDT_APICT       SDT_SYS386TGT
#define GSEL_APIC       GSEL(GCODE_SEL, SEL_KPL)
#endif
85
/*
 * Sanity checks on IDT vectors: the I/O interrupt range must end
 * exactly at the timer vector, the timer vector must precede the
 * local-interrupt range (which starts at 240), and the stop IPI must
 * fit below the spurious vector.
 */
CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT);
CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS);
CTASSERT(APIC_LOCAL_INTS == 240);
CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
91
/*
 * I/O interrupts use non-negative IRQ values.  These negative values
 * are used to mark unused IDT entries or IDT entries reserved for a
 * non-I/O interrupt (see la_ioint_irqs[] initialization in
 * native_lapic_create()).
 */
#define IRQ_FREE        -1      /* IDT vector slot is unused. */
#define IRQ_TIMER       -2      /* Reserved for the local APIC timer. */
#define IRQ_SYSCALL     -3      /* Reserved for the IDT_SYSCALL gate. */
#define IRQ_DTRACE_RET  -4      /* Reserved for DTrace (KDTRACE_HOOKS). */
#define IRQ_EVTCHN      -5      /* Reserved for Xen event channels (XENHVM). */
102
/* Local APIC timer operating modes (stored in la_timer_mode). */
enum lat_timer_mode {
        LAT_MODE_UNDEF =        0,      /* Timer not configured yet. */
        LAT_MODE_PERIODIC =     1,      /* Auto-reloading periodic interrupts. */
        LAT_MODE_ONESHOT =      2,      /* Single countdown. */
        LAT_MODE_DEADLINE =     3,      /* TSC-deadline mode. */
};
109
110 /*
111  * Support for local APICs.  Local APICs manage interrupts on each
112  * individual processor as opposed to I/O APICs which receive interrupts
113  * from I/O devices and then forward them on to the local APICs.
114  *
115  * Local APICs can also send interrupts to each other thus providing the
116  * mechanism for IPIs.
117  */
118
/*
 * Software description of one local vector table (LVT) entry;
 * lvt_mode_impl() folds these fields into the hardware register format.
 */
struct lvt {
        u_int lvt_edgetrigger:1;        /* 1 = edge, 0 = level triggered. */
        u_int lvt_activehi:1;           /* 1 = active high, 0 = active low. */
        u_int lvt_masked:1;             /* Entry is masked (APIC_LVT_M). */
        u_int lvt_active:1;             /* Per-CPU override is in effect. */
        u_int lvt_mode:16;              /* Delivery mode (APIC_LVT_DM_*). */
        u_int lvt_vector:8;             /* IDT vector for fixed delivery. */
};
127
128 struct lapic {
129         struct lvt la_lvts[APIC_LVT_MAX + 1];
130         struct lvt la_elvts[APIC_ELVT_MAX + 1];;
131         u_int la_id:8;
132         u_int la_cluster:4;
133         u_int la_cluster_id:2;
134         u_int la_present:1;
135         u_long *la_timer_count;
136         uint64_t la_timer_period;
137         enum lat_timer_mode la_timer_mode;
138         uint32_t lvt_timer_base;
139         uint32_t lvt_timer_last;
140         /* Include IDT_SYSCALL to make indexing easier. */
141         int la_ioint_irqs[APIC_NUM_IOINTS + 1];
142 } static lapics[MAX_APIC_ID + 1];
143
/*
 * Global defaults for local APIC LVT entries, indexed by APIC_LVT_*
 * pin number.  Per-CPU overrides live in la_lvts[] and take effect
 * when lvt_active is set (see lvt_mode()).
 */
static struct lvt lvts[APIC_LVT_MAX + 1] = {
        { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 },  /* LINT0: masked ExtINT */
        { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },     /* LINT1: NMI */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT },      /* Timer */
        { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },      /* Error */
        { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },     /* PMC */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },    /* Thermal */
        { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },        /* CMCI */
};
154
/*
 * Global defaults for AMD local APIC ELVT entries.  All entries start
 * inactive (lvt_active == 0); entry 1 defaults to the CMC interrupt
 * vector.
 */
static struct lvt elvts[APIC_ELVT_MAX + 1] = {
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT },
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
        { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
};
162
/*
 * I/O interrupt entry points, one per 32-vector IDT group; index 0
 * (vectors 0-31, the exception range) has no I/O interrupt handler.
 */
static inthand_t *ioint_handlers[] = {
        NULL,                   /* 0 - 31 */
        IDTVEC(apic_isr1),      /* 32 - 63 */
        IDTVEC(apic_isr2),      /* 64 - 95 */
        IDTVEC(apic_isr3),      /* 96 - 127 */
        IDTVEC(apic_isr4),      /* 128 - 159 */
        IDTVEC(apic_isr5),      /* 160 - 191 */
        IDTVEC(apic_isr6),      /* 192 - 223 */
        IDTVEC(apic_isr7),      /* 224 - 255 */
};
173
/*
 * Page-table-isolation (PTI) variants of the per-group I/O interrupt
 * entry points, used when the 'pti' mitigation is enabled.
 */
static inthand_t *ioint_pti_handlers[] = {
        NULL,                   /* 0 - 31 */
        IDTVEC(apic_isr1_pti),  /* 32 - 63 */
        IDTVEC(apic_isr2_pti),  /* 64 - 95 */
        IDTVEC(apic_isr3_pti),  /* 96 - 127 */
        IDTVEC(apic_isr4_pti),  /* 128 - 159 */
        IDTVEC(apic_isr5_pti),  /* 160 - 191 */
        IDTVEC(apic_isr6_pti),  /* 192 - 223 */
        IDTVEC(apic_isr7_pti),  /* 224 - 255 */
};
184
/* Timer divide-configuration values, indexed by log2 of the divisor. */
static u_int32_t lapic_timer_divisors[] = {
        APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
        APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
};
189
extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);

/* MMIO mapping of the LAPIC page; NULL when running in x2APIC mode. */
volatile char *lapic_map;
vm_paddr_t lapic_paddr;         /* Physical address of the LAPIC page. */
int x2apic_mode;                /* Non-zero: access registers via MSRs. */
int lapic_eoi_suppression;      /* EOI broadcast suppression enabled. */
static int lapic_timer_tsc_deadline;    /* TSC-deadline timer mode in use. */
static u_long lapic_timer_divisor, count_freq;
static struct eventtimer lapic_et;      /* Event timer backed by LAPIC timer. */
#ifdef SMP
/* Busy-wait iterations per microsecond for xAPIC IPI ack polling. */
static uint64_t lapic_ipi_wait_mult;
#endif

SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, "");
SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD,
    &lapic_eoi_suppression, 0, "");
SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD,
    &lapic_timer_tsc_deadline, 0, "");

static void lapic_calibrate_initcount(struct lapic *la);
static void lapic_calibrate_deadline(struct lapic *la);
212
213 static uint32_t
214 lapic_read32(enum LAPIC_REGISTERS reg)
215 {
216         uint32_t res;
217
218         if (x2apic_mode) {
219                 res = rdmsr32(MSR_APIC_000 + reg);
220         } else {
221                 res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL);
222         }
223         return (res);
224 }
225
226 static void
227 lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val)
228 {
229
230         if (x2apic_mode) {
231                 mfence();
232                 lfence();
233                 wrmsr(MSR_APIC_000 + reg, val);
234         } else {
235                 *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
236         }
237 }
238
239 static void
240 lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val)
241 {
242
243         if (x2apic_mode) {
244                 wrmsr(MSR_APIC_000 + reg, val);
245         } else {
246                 *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
247         }
248 }
249
250 #ifdef SMP
251 static uint64_t
252 lapic_read_icr(void)
253 {
254         uint64_t v;
255         uint32_t vhi, vlo;
256
257         if (x2apic_mode) {
258                 v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO);
259         } else {
260                 vhi = lapic_read32(LAPIC_ICR_HI);
261                 vlo = lapic_read32(LAPIC_ICR_LO);
262                 v = ((uint64_t)vhi << 32) | vlo;
263         }
264         return (v);
265 }
266
/*
 * Read only the low dword of the ICR (xAPIC register access); used by
 * the IPI-wait calibration loop in native_lapic_init().
 */
static uint64_t
lapic_read_icr_lo(void)
{

        return (lapic_read32(LAPIC_ICR_LO));
}
273
274 static void
275 lapic_write_icr(uint32_t vhi, uint32_t vlo)
276 {
277         uint64_t v;
278
279         if (x2apic_mode) {
280                 v = ((uint64_t)vhi << 32) | vlo;
281                 mfence();
282                 wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
283         } else {
284                 lapic_write32(LAPIC_ICR_HI, vhi);
285                 lapic_write32(LAPIC_ICR_LO, vlo);
286         }
287 }
288 #endif /* SMP */
289
290 static void
291 native_lapic_enable_x2apic(void)
292 {
293         uint64_t apic_base;
294
295         apic_base = rdmsr(MSR_APICBASE);
296         apic_base |= APICBASE_X2APIC | APICBASE_ENABLED;
297         wrmsr(MSR_APICBASE, apic_base);
298 }
299
300 static bool
301 native_lapic_is_x2apic(void)
302 {
303         uint64_t apic_base;
304
305         apic_base = rdmsr(MSR_APICBASE);
306         return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) ==
307             (APICBASE_X2APIC | APICBASE_ENABLED));
308 }
309
/* Forward declarations for internal helpers. */
static void     lapic_enable(void);
static void     lapic_resume(struct pic *pic, bool suspend_cancelled);
static void     lapic_timer_oneshot(struct lapic *);
static void     lapic_timer_oneshot_nointr(struct lapic *, uint32_t);
static void     lapic_timer_periodic(struct lapic *);
static void     lapic_timer_deadline(struct lapic *);
static void     lapic_timer_stop(struct lapic *);
static void     lapic_timer_set_divisor(u_int divisor);
static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value);
static int      lapic_et_start(struct eventtimer *et,
                    sbintime_t first, sbintime_t period);
static int      lapic_et_stop(struct eventtimer *et);
static u_int    apic_idt_to_irq(u_int apic_id, u_int vector);
static void     lapic_set_tpr(u_int vector);

/* PIC glue for the local APIC; only the resume method is provided. */
struct pic lapic_pic = { .pic_resume = lapic_resume };
326
/* Forward declarations for apic_ops */
static void     native_lapic_create(u_int apic_id, int boot_cpu);
static void     native_lapic_init(vm_paddr_t addr);
static void     native_lapic_xapic_mode(void);
static void     native_lapic_setup(int boot);
static void     native_lapic_dump(const char *str);
static void     native_lapic_disable(void);
static void     native_lapic_eoi(void);
static int      native_lapic_id(void);
static int      native_lapic_intr_pending(u_int vector);
static u_int    native_apic_cpuid(u_int apic_id);
static u_int    native_apic_alloc_vector(u_int apic_id, u_int irq);
static u_int    native_apic_alloc_vectors(u_int apic_id, u_int *irqs,
                    u_int count, u_int align);
static void     native_apic_disable_vector(u_int apic_id, u_int vector);
static void     native_apic_enable_vector(u_int apic_id, u_int vector);
static void     native_apic_free_vector(u_int apic_id, u_int vector, u_int irq);
static void     native_lapic_set_logical_id(u_int apic_id, u_int cluster,
                    u_int cluster_id);
static int      native_lapic_enable_pmc(void);
static void     native_lapic_disable_pmc(void);
static void     native_lapic_reenable_pmc(void);
static void     native_lapic_enable_cmc(void);
static int      native_lapic_enable_mca_elvt(void);
static int      native_lapic_set_lvt_mask(u_int apic_id, u_int lvt,
                    u_char masked);
static int      native_lapic_set_lvt_mode(u_int apic_id, u_int lvt,
                    uint32_t mode);
static int      native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
                    enum intr_polarity pol);
static int      native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
                    enum intr_trigger trigger);
#ifdef SMP
static void     native_lapic_ipi_raw(register_t icrlo, u_int dest);
static void     native_lapic_ipi_vectored(u_int vector, int dest);
static int      native_lapic_ipi_wait(int delay);
#endif /* SMP */
static int      native_lapic_ipi_alloc(inthand_t *ipifunc);
static void     native_lapic_ipi_free(int vector);

/* Dispatch table wiring the APIC interface to the native implementation. */
struct apic_ops apic_ops = {
        .create                 = native_lapic_create,
        .init                   = native_lapic_init,
        .xapic_mode             = native_lapic_xapic_mode,
        .is_x2apic              = native_lapic_is_x2apic,
        .setup                  = native_lapic_setup,
        .dump                   = native_lapic_dump,
        .disable                = native_lapic_disable,
        .eoi                    = native_lapic_eoi,
        .id                     = native_lapic_id,
        .intr_pending           = native_lapic_intr_pending,
        .set_logical_id         = native_lapic_set_logical_id,
        .cpuid                  = native_apic_cpuid,
        .alloc_vector           = native_apic_alloc_vector,
        .alloc_vectors          = native_apic_alloc_vectors,
        .enable_vector          = native_apic_enable_vector,
        .disable_vector         = native_apic_disable_vector,
        .free_vector            = native_apic_free_vector,
        .enable_pmc             = native_lapic_enable_pmc,
        .disable_pmc            = native_lapic_disable_pmc,
        .reenable_pmc           = native_lapic_reenable_pmc,
        .enable_cmc             = native_lapic_enable_cmc,
        .enable_mca_elvt        = native_lapic_enable_mca_elvt,
#ifdef SMP
        .ipi_raw                = native_lapic_ipi_raw,
        .ipi_vectored           = native_lapic_ipi_vectored,
        .ipi_wait               = native_lapic_ipi_wait,
#endif
        .ipi_alloc              = native_lapic_ipi_alloc,
        .ipi_free               = native_lapic_ipi_free,
        .set_lvt_mask           = native_lapic_set_lvt_mask,
        .set_lvt_mode           = native_lapic_set_lvt_mode,
        .set_lvt_polarity       = native_lapic_set_lvt_polarity,
        .set_lvt_triggermode    = native_lapic_set_lvt_triggermode,
};
402
403 static uint32_t
404 lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value)
405 {
406
407         value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
408             APIC_LVT_VECTOR);
409         if (lvt->lvt_edgetrigger == 0)
410                 value |= APIC_LVT_TM;
411         if (lvt->lvt_activehi == 0)
412                 value |= APIC_LVT_IIPP_INTALO;
413         if (lvt->lvt_masked)
414                 value |= APIC_LVT_M;
415         value |= lvt->lvt_mode;
416         switch (lvt->lvt_mode) {
417         case APIC_LVT_DM_NMI:
418         case APIC_LVT_DM_SMI:
419         case APIC_LVT_DM_INIT:
420         case APIC_LVT_DM_EXTINT:
421                 if (!lvt->lvt_edgetrigger && bootverbose) {
422                         printf("lapic%u: Forcing LINT%u to edge trigger\n",
423                             la->la_id, pin);
424                         value &= ~APIC_LVT_TM;
425                 }
426                 /* Use a vector of 0. */
427                 break;
428         case APIC_LVT_DM_FIXED:
429                 value |= lvt->lvt_vector;
430                 break;
431         default:
432                 panic("bad APIC LVT delivery mode: %#x\n", value);
433         }
434         return (value);
435 }
436
437 static uint32_t
438 lvt_mode(struct lapic *la, u_int pin, uint32_t value)
439 {
440         struct lvt *lvt;
441
442         KASSERT(pin <= APIC_LVT_MAX,
443             ("%s: pin %u out of range", __func__, pin));
444         if (la->la_lvts[pin].lvt_active)
445                 lvt = &la->la_lvts[pin];
446         else
447                 lvt = &lvts[pin];
448
449         return (lvt_mode_impl(la, lvt, pin, value));
450 }
451
452 static uint32_t
453 elvt_mode(struct lapic *la, u_int idx, uint32_t value)
454 {
455         struct lvt *elvt;
456
457         KASSERT(idx <= APIC_ELVT_MAX,
458             ("%s: idx %u out of range", __func__, idx));
459
460         elvt = &la->la_elvts[idx];
461         KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx));
462         KASSERT(elvt->lvt_edgetrigger,
463             ("%s: ELVT%u is not edge triggered", __func__, idx));
464         KASSERT(elvt->lvt_activehi,
465             ("%s: ELVT%u is not active high", __func__, idx));
466         return (lvt_mode_impl(la, elvt, idx, value));
467 }
468
/*
 * Map the local APIC and setup necessary interrupt vectors.
 */
static void
native_lapic_init(vm_paddr_t addr)
{
#ifdef SMP
        uint64_t r, r1, r2, rx;
#endif
        uint32_t ver;
        u_int regs[4];
        int i, arat;

        /*
         * Enable x2APIC mode if possible. Map the local APIC
         * registers page.
         *
         * Keep the LAPIC registers page mapped uncached for x2APIC
         * mode too, to have direct map page attribute set to
         * uncached.  This is needed to work around CPU errata present
         * on all Intel processors.
         */
        KASSERT(trunc_page(addr) == addr,
            ("local APIC not aligned on a page boundary"));
        lapic_paddr = addr;
        lapic_map = pmap_mapdev(addr, PAGE_SIZE);
        if (x2apic_mode) {
                native_lapic_enable_x2apic();
                /* Registers are reached via MSRs; drop the MMIO pointer. */
                lapic_map = NULL;
        }

        /* Setup the spurious interrupt handler. */
        setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
            GSEL_APIC);

        /* Perform basic initialization of the BSP's local APIC. */
        lapic_enable();

        /* Set BSP's per-CPU local APIC ID. */
        PCPU_SET(apic_id, lapic_id());

        /* Local APIC timer interrupt. */
        setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
            SDT_APIC, SEL_KPL, GSEL_APIC);

        /* Local APIC error interrupt. */
        setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
            SDT_APIC, SEL_KPL, GSEL_APIC);

        /* XXX: Thermal interrupt */

        /* Local APIC CMCI. */
        setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
            SDT_APICT, SEL_KPL, GSEL_APIC);

        /* Register the LAPIC timer event timer unless hint disables it. */
        if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
                arat = 0;
                /* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */
                if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) {
                        do_cpuid(0x06, regs);
                        if ((regs[0] & CPUTPM1_ARAT) != 0)
                                arat = 1;
                } else if (cpu_vendor_id == CPU_VENDOR_AMD &&
                    CPUID_TO_FAMILY(cpu_id) >= 0x12) {
                        arat = 1;
                }
                bzero(&lapic_et, sizeof(lapic_et));
                lapic_et.et_name = "LAPIC";
                lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT |
                    ET_FLAGS_PERCPU;
                lapic_et.et_quality = 600;
                if (!arat) {
                        /* Timer stops in deep C-states: lower the quality. */
                        lapic_et.et_flags |= ET_FLAGS_C3STOP;
                        lapic_et.et_quality = 100;
                }
                if ((cpu_feature & CPUID_TSC) != 0 &&
                    (cpu_feature2 & CPUID2_TSCDLT) != 0 &&
                    tsc_is_invariant && tsc_freq != 0) {
                        /* Prefer TSC-deadline mode; tunable can override. */
                        lapic_timer_tsc_deadline = 1;
                        TUNABLE_INT_FETCH("hw.lapic_tsc_deadline",
                            &lapic_timer_tsc_deadline);
                }

                lapic_et.et_frequency = 0;
                /* We don't know frequency yet, so trying to guess. */
                lapic_et.et_min_period = 0x00001000LL;
                lapic_et.et_max_period = SBT_1S;
                lapic_et.et_start = lapic_et_start;
                lapic_et.et_stop = lapic_et_stop;
                lapic_et.et_priv = NULL;
                et_register(&lapic_et);
        }

        /*
         * Set lapic_eoi_suppression after lapic_enable(), to not
         * enable suppression in the hardware prematurely.  Note that
         * we by default enable suppression even when system only has
         * one IO-APIC, since EOI is broadcasted to all APIC agents,
         * including CPUs, otherwise.
         *
         * It seems that at least some KVM versions report
         * EOI_SUPPRESSION bit, but auto-EOI does not work.
         */
        ver = lapic_read32(LAPIC_VERSION);
        if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
                lapic_eoi_suppression = 1;
                if (vm_guest == VM_GUEST_KVM) {
                        if (bootverbose)
                                printf(
                       "KVM -- disabling lapic eoi suppression\n");
                        lapic_eoi_suppression = 0;
                }
                TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
                    &lapic_eoi_suppression);
        }

#ifdef SMP
#define LOOPS   100000
        /*
         * Calibrate the busy loop waiting for IPI ack in xAPIC mode.
         * lapic_ipi_wait_mult contains the number of iterations which
         * approximately delay execution for 1 microsecond (the
         * argument to native_lapic_ipi_wait() is in microseconds).
         *
         * We assume that TSC is present and already measured.
         * Possible TSC frequency jumps are irrelevant to the
         * calibration loop below, the CPU clock management code is
         * not yet started, and we do not enter sleep states.
         */
        KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0,
            ("TSC not initialized"));
        if (!x2apic_mode) {
                r = rdtsc();
                for (rx = 0; rx < LOOPS; rx++) {
                        (void)lapic_read_icr_lo();
                        ia32_pause();
                }
                /* r = TSC ticks consumed by LOOPS register reads. */
                r = rdtsc() - r;
                r1 = tsc_freq * LOOPS;
                r2 = r * 1000000;
                lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
                if (bootverbose) {
                        printf("LAPIC: ipi_wait() us multiplier %ju (r %ju "
                            "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult,
                            (uintmax_t)r, (uintmax_t)tsc_freq);
                }
        }
#undef LOOPS
#endif /* SMP */
}
619
/*
 * Create a local APIC instance.
 */
static void
native_lapic_create(u_int apic_id, int boot_cpu)
{
        int i;

        /* Refuse IDs beyond the static table; the BSP must always fit. */
        if (apic_id > MAX_APIC_ID) {
                printf("APIC: Ignoring local APIC with ID %d\n", apic_id);
                if (boot_cpu)
                        panic("Can't ignore BSP");
                return;
        }
        KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u",
            apic_id));

        /*
         * Assume no local LVT overrides and a cluster of 0 and
         * intra-cluster ID of 0.
         */
        lapics[apic_id].la_present = 1;
        lapics[apic_id].la_id = apic_id;
        /* Seed LVT/ELVT state from the global defaults, override inactive. */
        for (i = 0; i <= APIC_LVT_MAX; i++) {
                lapics[apic_id].la_lvts[i] = lvts[i];
                lapics[apic_id].la_lvts[i].lvt_active = 0;
        }
        for (i = 0; i <= APIC_ELVT_MAX; i++) {
                lapics[apic_id].la_elvts[i] = elvts[i];
                lapics[apic_id].la_elvts[i].lvt_active = 0;
        }
        /* Mark all I/O interrupt slots free, then reserve the fixed ones. */
        for (i = 0; i <= APIC_NUM_IOINTS; i++)
            lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE;
        lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
        lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
            IRQ_TIMER;
#ifdef KDTRACE_HOOKS
        lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] =
            IRQ_DTRACE_RET;
#endif
#ifdef XENHVM
        lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN;
#endif


#ifdef SMP
        cpu_add(apic_id, boot_cpu);
#endif
}
669
670 static inline uint32_t
671 amd_read_ext_features(void)
672 {
673         uint32_t version;
674
675         if (cpu_vendor_id != CPU_VENDOR_AMD)
676                 return (0);
677         version = lapic_read32(LAPIC_VERSION);
678         if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
679                 return (lapic_read32(LAPIC_EXT_FEATURES));
680         else
681                 return (0);
682 }
683
684 static inline uint32_t
685 amd_read_elvt_count(void)
686 {
687         uint32_t extf;
688         uint32_t count;
689
690         extf = amd_read_ext_features();
691         count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
692         count = min(count, APIC_ELVT_MAX + 1);
693         return (count);
694 }
695
/*
 * Dump contents of local APIC registers
 */
static void
native_lapic_dump(const char* str)
{
        uint32_t version;
        uint32_t maxlvt;
        uint32_t extf;
        int elvt_count;
        int i;

        version = lapic_read32(LAPIC_VERSION);
        maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
        printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
        /* DFR is not readable in x2APIC mode; print 0 instead. */
        printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x",
            lapic_read32(LAPIC_ID), version,
            lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR));
        if ((cpu_feature2 & CPUID2_X2APIC) != 0)
                printf(" x2APIC: %d", x2apic_mode);
        printf("\n  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
            lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1),
            lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR));
        printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x",
            lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL),
            lapic_read32(LAPIC_LVT_ERROR));
        /* PMC and CMCI LVT entries only exist when maxlvt reaches them. */
        if (maxlvt >= APIC_LVT_PMC)
                printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT));
        printf("\n");
        if (maxlvt >= APIC_LVT_CMCI)
                printf("   cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI));
        extf = amd_read_ext_features();
        if (extf != 0) {
                printf("   AMD ext features: 0x%08x\n", extf);
                elvt_count = amd_read_elvt_count();
                for (i = 0; i < elvt_count; i++)
                        printf("   AMD elvt%d: 0x%08x\n", i,
                            lapic_read32(LAPIC_EXT_LVT0 + i));
        }
}
736
/*
 * Re-assert x2APIC mode in the APICBASE MSR when it is configured;
 * no-op in plain xAPIC mode.  Interrupts are disabled around the MSR
 * update.
 */
static void
native_lapic_xapic_mode(void)
{
        register_t saveintr;

        saveintr = intr_disable();
        if (x2apic_mode)
                native_lapic_enable_x2apic();
        intr_restore(saveintr);
}
747
/*
 * Program the current CPU's local APIC: TPR, spurious vector, LVT
 * entries, timer, error and CMCI vectors, and active AMD extended
 * LVTs.  Interrupts are disabled for the duration.
 */
static void
native_lapic_setup(int boot)
{
        struct lapic *la;
        uint32_t version;
        uint32_t maxlvt;
        register_t saveintr;
        int elvt_count;
        int i;

        saveintr = intr_disable();

        la = &lapics[lapic_id()];
        KASSERT(la->la_present, ("missing APIC structure"));
        version = lapic_read32(LAPIC_VERSION);
        maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;

        /* Initialize the TPR to allow all interrupts. */
        lapic_set_tpr(0);

        /* Setup spurious vector and enable the local APIC. */
        lapic_enable();

        /* Program LINT[01] LVT entries. */
        lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0,
            lapic_read32(LAPIC_LVT_LINT0)));
        lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1,
            lapic_read32(LAPIC_LVT_LINT1)));

        /*
         * Program the PMC LVT entry if present.
         * NOTE(review): the register offset constant LAPIC_LVT_PCINT is
         * passed as the current value instead of
         * lapic_read32(LAPIC_LVT_PCINT) as the other entries do.
         * lvt_mode() rewrites the mask/trigger/polarity/mode/vector
         * fields, but confirm the remaining bits are intended to come
         * from the constant.
         */
        if (maxlvt >= APIC_LVT_PMC) {
                lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
                    LAPIC_LVT_PCINT));
        }

        /* Program timer LVT. */
        la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER,
            lapic_read32(LAPIC_LVT_TIMER));
        la->lvt_timer_last = la->lvt_timer_base;
        lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base);

        /* Calibrate the timer parameters using BSP. */
        if (boot && IS_BSP()) {
                lapic_calibrate_initcount(la);
                if (lapic_timer_tsc_deadline)
                        lapic_calibrate_deadline(la);
        }

        /* Setup the timer if configured. */
        if (la->la_timer_mode != LAT_MODE_UNDEF) {
                KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
                    lapic_id()));
                switch (la->la_timer_mode) {
                case LAT_MODE_PERIODIC:
                        lapic_timer_set_divisor(lapic_timer_divisor);
                        lapic_timer_periodic(la);
                        break;
                case LAT_MODE_ONESHOT:
                        lapic_timer_set_divisor(lapic_timer_divisor);
                        lapic_timer_oneshot(la);
                        break;
                case LAT_MODE_DEADLINE:
                        /* TSC-deadline mode does not use the divisor. */
                        lapic_timer_deadline(la);
                        break;
                default:
                        panic("corrupted la_timer_mode %p %d", la,
                            la->la_timer_mode);
                }
        }

        /* Program error LVT and clear any existing errors. */
        lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR,
            lapic_read32(LAPIC_LVT_ERROR)));
        lapic_write32(LAPIC_ESR, 0);

        /* XXX: Thermal LVT */

        /* Program the CMCI LVT entry if present. */
        if (maxlvt >= APIC_LVT_CMCI) {
                lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI,
                    lapic_read32(LAPIC_LVT_CMCI)));
        }

        /* Program any AMD extended LVT entries with active overrides. */
        elvt_count = amd_read_elvt_count();
        for (i = 0; i < elvt_count; i++) {
                if (la->la_elvts[i].lvt_active)
                        lapic_write32(LAPIC_EXT_LVT0 + i,
                            elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i)));
        }

        intr_restore(saveintr);
}
840
841 static void
842 native_lapic_intrcnt(void *dummy __unused)
843 {
844         struct pcpu *pc;
845         struct lapic *la;
846         char buf[MAXCOMLEN + 1];
847
848         STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
849                 la = &lapics[pc->pc_apic_id];
850                 if (!la->la_present)
851                     continue;
852
853                 snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid);
854                 intrcnt_add(buf, &la->la_timer_count);
855         }
856 }
857 SYSINIT(native_lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, native_lapic_intrcnt,
858     NULL);
859
/*
 * Unmask the performance-counter LVT on the current CPU so PMC
 * interrupts can be delivered again.
 */
static void
native_lapic_reenable_pmc(void)
{
#ifdef HWPMC_HOOKS
        uint32_t lvt;

        lvt = lapic_read32(LAPIC_LVT_PCINT) & ~APIC_LVT_M;
        lapic_write32(LAPIC_LVT_PCINT, lvt);
#endif
}
871
872 #ifdef HWPMC_HOOKS
873 static void
874 lapic_update_pmc(void *dummy)
875 {
876         struct lapic *la;
877
878         la = &lapics[lapic_id()];
879         lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
880             lapic_read32(LAPIC_LVT_PCINT)));
881 }
882 #endif
883
/*
 * Unmask the performance-counter LVT so hwpmc can receive interrupts.
 * Returns 1 on success and 0 when no local APIC or no PMC LVT is
 * available (or HWPMC_HOOKS is not compiled in).
 */
static int
native_lapic_enable_pmc(void)
{
#ifdef HWPMC_HOOKS
        u_int32_t maxlvt;

        /* Fail if the local APIC is not present. */
        if (!x2apic_mode && lapic_map == NULL)
                return (0);

        /* Fail if the PMC LVT is not present. */
        maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
        if (maxlvt < APIC_LVT_PMC)
                return (0);

        /* Unmask in the global LVT template used when LVTs are programmed. */
        lvts[APIC_LVT_PMC].lvt_masked = 0;

#ifdef EARLY_AP_STARTUP
        MPASS(mp_ncpus == 1 || smp_started);
        smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
#else
#ifdef SMP
        /*
         * If hwpmc was loaded at boot time then the APs may not be
         * started yet.  In that case, don't forward the request to
         * them as they will program the lvt when they start.
         */
        if (smp_started)
                smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
        else
#endif
                lapic_update_pmc(NULL);
#endif
        return (1);
#else
        return (0);
#endif
}
922
/*
 * Re-mask the performance-counter LVT on every CPU when hwpmc is
 * unloaded.  Silently does nothing without a local APIC or PMC LVT.
 */
static void
native_lapic_disable_pmc(void)
{
#ifdef HWPMC_HOOKS
        u_int32_t maxlvt;

        /* Fail if the local APIC is not present. */
        if (!x2apic_mode && lapic_map == NULL)
                return;

        /* Fail if the PMC LVT is not present. */
        maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
        if (maxlvt < APIC_LVT_PMC)
                return;

        /* Mask in the global template, then propagate to all CPUs. */
        lvts[APIC_LVT_PMC].lvt_masked = 1;

#ifdef SMP
        /* The APs should always be started when hwpmc is unloaded. */
        KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early"));
#endif
        smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
#endif
}
947
/*
 * Measure the LAPIC timer frequency (ticks per second) and store it in
 * count_freq.  The divisor is doubled until a one-second DELAY() no
 * longer exhausts the full 32-bit count.
 */
static void
lapic_calibrate_initcount(struct lapic *la)
{
        u_long value;

        /* Start off with a divisor of 2 (power on reset default). */
        lapic_timer_divisor = 2;
        /* Try to calibrate the local APIC timer. */
        do {
                lapic_timer_set_divisor(lapic_timer_divisor);
                lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
                DELAY(1000000);
                /* Ticks consumed during the one-second delay. */
                value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
                if (value != APIC_TIMER_MAX_COUNT)
                        break;
                /* Full count elapsed in under a second: divisor too small. */
                lapic_timer_divisor <<= 1;
        } while (lapic_timer_divisor <= 128);
        if (lapic_timer_divisor > 128)
                panic("lapic: Divisor too big");
        if (bootverbose) {
                printf("lapic: Divisor %lu, Frequency %lu Hz\n",
                    lapic_timer_divisor, value);
        }
        count_freq = value;
}
973
974 static void
975 lapic_calibrate_deadline(struct lapic *la __unused)
976 {
977
978         if (bootverbose) {
979                 printf("lapic: deadline tsc mode, Frequency %ju Hz\n",
980                     (uintmax_t)tsc_freq);
981         }
982 }
983
984 static void
985 lapic_change_mode(struct eventtimer *et, struct lapic *la,
986     enum lat_timer_mode newmode)
987 {
988
989         if (la->la_timer_mode == newmode)
990                 return;
991         switch (newmode) {
992         case LAT_MODE_PERIODIC:
993                 lapic_timer_set_divisor(lapic_timer_divisor);
994                 et->et_frequency = count_freq;
995                 break;
996         case LAT_MODE_DEADLINE:
997                 et->et_frequency = tsc_freq;
998                 break;
999         case LAT_MODE_ONESHOT:
1000                 lapic_timer_set_divisor(lapic_timer_divisor);
1001                 et->et_frequency = count_freq;
1002                 break;
1003         default:
1004                 panic("lapic_change_mode %d", newmode);
1005         }
1006         la->la_timer_mode = newmode;
1007         et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
1008         et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
1009 }
1010
/*
 * Event timer start method.  Programs this CPU's LAPIC timer in
 * periodic, TSC-deadline, or one-shot mode depending on 'period' and
 * TSC-deadline availability.
 */
static int
lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
{
        struct lapic *la;

        la = &lapics[PCPU_GET(apic_id)];
        if (period != 0) {
                lapic_change_mode(et, la, LAT_MODE_PERIODIC);
                /* sbintime -> ticks: (freq * period) >> 32 fixed point. */
                la->la_timer_period = ((uint32_t)et->et_frequency * period) >>
                    32;
                lapic_timer_periodic(la);
        } else if (lapic_timer_tsc_deadline) {
                lapic_change_mode(et, la, LAT_MODE_DEADLINE);
                /* Deadline mode keeps the full 64-bit TSC frequency. */
                la->la_timer_period = (et->et_frequency * first) >> 32;
                lapic_timer_deadline(la);
        } else {
                lapic_change_mode(et, la, LAT_MODE_ONESHOT);
                la->la_timer_period = ((uint32_t)et->et_frequency * first) >>
                    32;
                lapic_timer_oneshot(la);
        }
        return (0);
}
1034
1035 static int
1036 lapic_et_stop(struct eventtimer *et)
1037 {
1038         struct lapic *la;
1039
1040         la = &lapics[PCPU_GET(apic_id)];
1041         lapic_timer_stop(la);
1042         la->la_timer_mode = LAT_MODE_UNDEF;
1043         return (0);
1044 }
1045
1046 static void
1047 native_lapic_disable(void)
1048 {
1049         uint32_t value;
1050
1051         /* Software disable the local APIC. */
1052         value = lapic_read32(LAPIC_SVR);
1053         value &= ~APIC_SVR_SWEN;
1054         lapic_write32(LAPIC_SVR, value);
1055 }
1056
1057 static void
1058 lapic_enable(void)
1059 {
1060         uint32_t value;
1061
1062         /* Program the spurious vector to enable the local APIC. */
1063         value = lapic_read32(LAPIC_SVR);
1064         value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
1065         value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT;
1066         if (lapic_eoi_suppression)
1067                 value |= APIC_SVR_EOI_SUPPRESSION;
1068         lapic_write32(LAPIC_SVR, value);
1069 }
1070
/* Reset the local APIC on the BSP during resume. */
static void
lapic_resume(struct pic *pic, bool suspend_cancelled)
{

        /* boot == 0: skips the BSP timer calibration done at boot time. */
        lapic_setup(0);
}
1078
1079 static int
1080 native_lapic_id(void)
1081 {
1082         uint32_t v;
1083
1084         KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped"));
1085         v = lapic_read32(LAPIC_ID);
1086         if (!x2apic_mode)
1087                 v >>= APIC_ID_SHIFT;
1088         return (v);
1089 }
1090
1091 static int
1092 native_lapic_intr_pending(u_int vector)
1093 {
1094         uint32_t irr;
1095
1096         /*
1097          * The IRR registers are an array of registers each of which
1098          * only describes 32 interrupts in the low 32 bits.  Thus, we
1099          * divide the vector by 32 to get the register index.
1100          * Finally, we modulus the vector by 32 to determine the
1101          * individual bit to test.
1102          */
1103         irr = lapic_read32(LAPIC_IRR0 + vector / 32);
1104         return (irr & 1 << (vector % 32));
1105 }
1106
1107 static void
1108 native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
1109 {
1110         struct lapic *la;
1111
1112         KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist",
1113             __func__, apic_id));
1114         KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big",
1115             __func__, cluster));
1116         KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID,
1117             ("%s: intra cluster id %u too big", __func__, cluster_id));
1118         la = &lapics[apic_id];
1119         la->la_cluster = cluster;
1120         la->la_cluster_id = cluster_id;
1121 }
1122
1123 static int
1124 native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
1125 {
1126
1127         if (pin > APIC_LVT_MAX)
1128                 return (EINVAL);
1129         if (apic_id == APIC_ID_ALL) {
1130                 lvts[pin].lvt_masked = masked;
1131                 if (bootverbose)
1132                         printf("lapic:");
1133         } else {
1134                 KASSERT(lapics[apic_id].la_present,
1135                     ("%s: missing APIC %u", __func__, apic_id));
1136                 lapics[apic_id].la_lvts[pin].lvt_masked = masked;
1137                 lapics[apic_id].la_lvts[pin].lvt_active = 1;
1138                 if (bootverbose)
1139                         printf("lapic%u:", apic_id);
1140         }
1141         if (bootverbose)
1142                 printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked");
1143         return (0);
1144 }
1145
1146 static int
1147 native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
1148 {
1149         struct lvt *lvt;
1150
1151         if (pin > APIC_LVT_MAX)
1152                 return (EINVAL);
1153         if (apic_id == APIC_ID_ALL) {
1154                 lvt = &lvts[pin];
1155                 if (bootverbose)
1156                         printf("lapic:");
1157         } else {
1158                 KASSERT(lapics[apic_id].la_present,
1159                     ("%s: missing APIC %u", __func__, apic_id));
1160                 lvt = &lapics[apic_id].la_lvts[pin];
1161                 lvt->lvt_active = 1;
1162                 if (bootverbose)
1163                         printf("lapic%u:", apic_id);
1164         }
1165         lvt->lvt_mode = mode;
1166         switch (mode) {
1167         case APIC_LVT_DM_NMI:
1168         case APIC_LVT_DM_SMI:
1169         case APIC_LVT_DM_INIT:
1170         case APIC_LVT_DM_EXTINT:
1171                 lvt->lvt_edgetrigger = 1;
1172                 lvt->lvt_activehi = 1;
1173                 if (mode == APIC_LVT_DM_EXTINT)
1174                         lvt->lvt_masked = 1;
1175                 else
1176                         lvt->lvt_masked = 0;
1177                 break;
1178         default:
1179                 panic("Unsupported delivery mode: 0x%x\n", mode);
1180         }
1181         if (bootverbose) {
1182                 printf(" Routing ");
1183                 switch (mode) {
1184                 case APIC_LVT_DM_NMI:
1185                         printf("NMI");
1186                         break;
1187                 case APIC_LVT_DM_SMI:
1188                         printf("SMI");
1189                         break;
1190                 case APIC_LVT_DM_INIT:
1191                         printf("INIT");
1192                         break;
1193                 case APIC_LVT_DM_EXTINT:
1194                         printf("ExtINT");
1195                         break;
1196                 }
1197                 printf(" -> LINT%u\n", pin);
1198         }
1199         return (0);
1200 }
1201
1202 static int
1203 native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
1204 {
1205
1206         if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM)
1207                 return (EINVAL);
1208         if (apic_id == APIC_ID_ALL) {
1209                 lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH);
1210                 if (bootverbose)
1211                         printf("lapic:");
1212         } else {
1213                 KASSERT(lapics[apic_id].la_present,
1214                     ("%s: missing APIC %u", __func__, apic_id));
1215                 lapics[apic_id].la_lvts[pin].lvt_active = 1;
1216                 lapics[apic_id].la_lvts[pin].lvt_activehi =
1217                     (pol == INTR_POLARITY_HIGH);
1218                 if (bootverbose)
1219                         printf("lapic%u:", apic_id);
1220         }
1221         if (bootverbose)
1222                 printf(" LINT%u polarity: %s\n", pin,
1223                     pol == INTR_POLARITY_HIGH ? "high" : "low");
1224         return (0);
1225 }
1226
1227 static int
1228 native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin,
1229      enum intr_trigger trigger)
1230 {
1231
1232         if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
1233                 return (EINVAL);
1234         if (apic_id == APIC_ID_ALL) {
1235                 lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE);
1236                 if (bootverbose)
1237                         printf("lapic:");
1238         } else {
1239                 KASSERT(lapics[apic_id].la_present,
1240                     ("%s: missing APIC %u", __func__, apic_id));
1241                 lapics[apic_id].la_lvts[pin].lvt_edgetrigger =
1242                     (trigger == INTR_TRIGGER_EDGE);
1243                 lapics[apic_id].la_lvts[pin].lvt_active = 1;
1244                 if (bootverbose)
1245                         printf("lapic%u:", apic_id);
1246         }
1247         if (bootverbose)
1248                 printf(" LINT%u trigger: %s\n", pin,
1249                     trigger == INTR_TRIGGER_EDGE ? "edge" : "level");
1250         return (0);
1251 }
1252
1253 /*
1254  * Adjust the TPR of the current CPU so that it blocks all interrupts below
1255  * the passed in vector.
1256  */
1257 static void
1258 lapic_set_tpr(u_int vector)
1259 {
1260 #ifdef CHEAP_TPR
1261         lapic_write32(LAPIC_TPR, vector);
1262 #else
1263         uint32_t tpr;
1264
1265         tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO;
1266         tpr |= vector;
1267         lapic_write32(LAPIC_TPR, tpr);
1268 #endif
1269 }
1270
static void
native_lapic_eoi(void)
{

        /*
         * Acknowledge the current interrupt by writing 0 to the EOI
         * register.  NOTE(review): the _nofence variant presumably
         * skips the ordering fence used by lapic_write32 — confirm
         * against its definition.
         */
        lapic_write32_nofence(LAPIC_EOI, 0);
}
1277
1278 void
1279 lapic_handle_intr(int vector, struct trapframe *frame)
1280 {
1281         struct intsrc *isrc;
1282
1283         isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id),
1284             vector));
1285         intr_execute_handlers(isrc, frame);
1286 }
1287
/*
 * LAPIC timer interrupt handler: account the tick and run the event
 * timer callback with the interrupted trapframe exposed via curthread.
 */
void
lapic_handle_timer(struct trapframe *frame)
{
        struct lapic *la;
        struct trapframe *oldframe;
        struct thread *td;

        /* Send EOI first thing. */
        lapic_eoi();

#if defined(SMP) && !defined(SCHED_ULE)
        /*
         * Don't do any accounting for the disabled HTT cores, since it
         * will provide misleading numbers for the userland.
         *
         * No locking is necessary here, since even if we lose the race
         * when hlt_cpus_mask changes it is not a big deal, really.
         *
         * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask
         * and unlike other schedulers it actually schedules threads to
         * those CPUs.
         */
        if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask))
                return;
#endif

        /* Look up our local APIC structure for the tick counters. */
        la = &lapics[PCPU_GET(apic_id)];
        (*la->la_timer_count)++;
        critical_enter();
        if (lapic_et.et_active) {
                td = curthread;
                td->td_intr_nesting_level++;
                /* Save/restore the interrupt frame around the callback. */
                oldframe = td->td_intr_frame;
                td->td_intr_frame = frame;
                lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg);
                td->td_intr_frame = oldframe;
                td->td_intr_nesting_level--;
        }
        critical_exit();
}
1329
1330 static void
1331 lapic_timer_set_divisor(u_int divisor)
1332 {
1333
1334         KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
1335         KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors),
1336                 ("lapic: invalid divisor %u", divisor));
1337         lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]);
1338 }
1339
1340 static void
1341 lapic_timer_oneshot(struct lapic *la)
1342 {
1343         uint32_t value;
1344
1345         value = la->lvt_timer_base;
1346         value &= ~(APIC_LVTT_TM | APIC_LVT_M);
1347         value |= APIC_LVTT_TM_ONE_SHOT;
1348         la->lvt_timer_last = value;
1349         lapic_write32(LAPIC_LVT_TIMER, value);
1350         lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
1351 }
1352
1353 static void
1354 lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count)
1355 {
1356         uint32_t value;
1357
1358         value = la->lvt_timer_base;
1359         value &= ~APIC_LVTT_TM;
1360         value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M;
1361         la->lvt_timer_last = value;
1362         lapic_write32(LAPIC_LVT_TIMER, value);
1363         lapic_write32(LAPIC_ICR_TIMER, count);
1364 }
1365
1366 static void
1367 lapic_timer_periodic(struct lapic *la)
1368 {
1369         uint32_t value;
1370
1371         value = la->lvt_timer_base;
1372         value &= ~(APIC_LVTT_TM | APIC_LVT_M);
1373         value |= APIC_LVTT_TM_PERIODIC;
1374         la->lvt_timer_last = value;
1375         lapic_write32(LAPIC_LVT_TIMER, value);
1376         lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
1377 }
1378
/* Arm the timer in TSC-deadline mode: la_timer_period cycles from now. */
static void
lapic_timer_deadline(struct lapic *la)
{
        uint32_t value;

        value = la->lvt_timer_base;
        value &= ~(APIC_LVTT_TM | APIC_LVT_M);
        value |= APIC_LVTT_TM_TSCDLT;
        /* Only rewrite the LVT when its contents actually changed. */
        if (value != la->lvt_timer_last) {
                la->lvt_timer_last = value;
                lapic_write32_nofence(LAPIC_LVT_TIMER, value);
                /* Order the MMIO LVT write before arming the MSR. */
                if (!x2apic_mode)
                        mfence();
        }
        wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc());
}
1395
/* Stop the LAPIC timer, whichever mode it is currently in. */
static void
lapic_timer_stop(struct lapic *la)
{
        uint32_t value;

        if (la->la_timer_mode == LAT_MODE_DEADLINE) {
                /* Writing 0 to the deadline MSR disarms the timer. */
                wrmsr(MSR_TSC_DEADLINE, 0);
                mfence();
        } else {
                /* Mask the timer LVT for the count-based modes. */
                value = la->lvt_timer_base;
                value &= ~APIC_LVTT_TM;
                value |= APIC_LVT_M;
                la->lvt_timer_last = value;
                lapic_write32(LAPIC_LVT_TIMER, value);
        }
}
1412
/* CMCI interrupt handler: acknowledge, then hand off to the MCA code. */
void
lapic_handle_cmc(void)
{

        lapic_eoi();
        cmc_intr();
}
1420
1421 /*
1422  * Called from the mca_init() to activate the CMC interrupt if this CPU is
1423  * responsible for monitoring any MC banks for CMC events.  Since mca_init()
1424  * is called prior to lapic_setup() during boot, this just needs to unmask
1425  * this CPU's LVT_CMCI entry.
1426  */
1427 static void
1428 native_lapic_enable_cmc(void)
1429 {
1430         u_int apic_id;
1431
1432 #ifdef DEV_ATPIC
1433         if (!x2apic_mode && lapic_map == NULL)
1434                 return;
1435 #endif
1436         apic_id = PCPU_GET(apic_id);
1437         KASSERT(lapics[apic_id].la_present,
1438             ("%s: missing APIC %u", __func__, apic_id));
1439         lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0;
1440         lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1;
1441         if (bootverbose)
1442                 printf("lapic%u: CMCI unmasked\n", apic_id);
1443 }
1444
/*
 * Activate the AMD MCE Thresholding extended LVT for this CPU if it is
 * available and not already claimed.  Returns the ELVT index on
 * success or -1 on failure.
 */
static int
native_lapic_enable_mca_elvt(void)
{
        u_int apic_id;
        uint32_t value;
        int elvt_count;

#ifdef DEV_ATPIC
        if (lapic_map == NULL)
                return (-1);
#endif

        apic_id = PCPU_GET(apic_id);
        KASSERT(lapics[apic_id].la_present,
            ("%s: missing APIC %u", __func__, apic_id));
        elvt_count = amd_read_elvt_count();
        if (elvt_count <= APIC_ELVT_MCA)
                return (-1);

        /* An already-unmasked entry means another agent claimed it. */
        value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA);
        if ((value & APIC_LVT_M) == 0) {
                printf("AMD MCE Thresholding Extended LVT is already active\n");
                return (-1);
        }
        lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0;
        lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1;
        if (bootverbose)
                printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id);
        return (APIC_ELVT_MCA);
}
1475
/* Local APIC error interrupt handler: report the ESR contents. */
void
lapic_handle_error(void)
{
        uint32_t esr;

        /*
         * Read the contents of the error status register.  Write to
         * the register first before reading from it to force the APIC
         * to update its value to indicate any errors that have
         * occurred since the previous write to the register.
         */
        lapic_write32(LAPIC_ESR, 0);
        esr = lapic_read32(LAPIC_ESR);

        printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
        lapic_eoi();
}
1493
/* Map an APIC ID to its FreeBSD CPU number. */
static u_int
native_apic_cpuid(u_int apic_id)
{
#ifdef SMP
        return apic_cpuids[apic_id];
#else
        /* A UP kernel only has CPU 0. */
        return 0;
#endif
}
1503
1504 /* Request a free IDT vector to be used by the specified IRQ. */
1505 static u_int
1506 native_apic_alloc_vector(u_int apic_id, u_int irq)
1507 {
1508         u_int vector;
1509
1510         KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
1511
1512         /*
1513          * Search for a free vector.  Currently we just use a very simple
1514          * algorithm to find the first free vector.
1515          */
1516         mtx_lock_spin(&icu_lock);
1517         for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
1518                 if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE)
1519                         continue;
1520                 lapics[apic_id].la_ioint_irqs[vector] = irq;
1521                 mtx_unlock_spin(&icu_lock);
1522                 return (vector + APIC_IO_INTS);
1523         }
1524         mtx_unlock_spin(&icu_lock);
1525         return (0);
1526 }
1527
1528 /*
1529  * Request 'count' free contiguous IDT vectors to be used by 'count'
1530  * IRQs.  'count' must be a power of two and the vectors will be
1531  * aligned on a boundary of 'align'.  If the request cannot be
1532  * satisfied, 0 is returned.
1533  */
1534 static u_int
1535 native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
1536 {
1537         u_int first, run, vector;
1538
1539         KASSERT(powerof2(count), ("bad count"));
1540         KASSERT(powerof2(align), ("bad align"));
1541         KASSERT(align >= count, ("align < count"));
1542 #ifdef INVARIANTS
1543         for (run = 0; run < count; run++)
1544                 KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u",
1545                     irqs[run], run));
1546 #endif
1547
1548         /*
1549          * Search for 'count' free vectors.  As with apic_alloc_vector(),
1550          * this just uses a simple first fit algorithm.
1551          */
1552         run = 0;
1553         first = 0;
1554         mtx_lock_spin(&icu_lock);
1555         for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
1556
1557                 /* Vector is in use, end run. */
1558                 if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) {
1559                         run = 0;
1560                         first = 0;
1561                         continue;
1562                 }
1563
1564                 /* Start a new run if run == 0 and vector is aligned. */
1565                 if (run == 0) {
1566                         if ((vector & (align - 1)) != 0)
1567                                 continue;
1568                         first = vector;
1569                 }
1570                 run++;
1571
1572                 /* Keep looping if the run isn't long enough yet. */
1573                 if (run < count)
1574                         continue;
1575
1576                 /* Found a run, assign IRQs and return the first vector. */
1577                 for (vector = 0; vector < count; vector++)
1578                         lapics[apic_id].la_ioint_irqs[first + vector] =
1579                             irqs[vector];
1580                 mtx_unlock_spin(&icu_lock);
1581                 return (first + APIC_IO_INTS);
1582         }
1583         mtx_unlock_spin(&icu_lock);
1584         printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count);
1585         return (0);
1586 }
1587
1588 /*
1589  * Enable a vector for a particular apic_id.  Since all lapics share idt
1590  * entries and ioint_handlers this enables the vector on all lapics.  lapics
1591  * which do not have the vector configured would report spurious interrupts
1592  * should it fire.
1593  */
1594 static void
1595 native_apic_enable_vector(u_int apic_id, u_int vector)
1596 {
1597
1598         KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
1599         KASSERT(ioint_handlers[vector / 32] != NULL,
1600             ("No ISR handler for vector %u", vector));
1601 #ifdef KDTRACE_HOOKS
1602         KASSERT(vector != IDT_DTRACE_RET,
1603             ("Attempt to overwrite DTrace entry"));
1604 #endif
1605         setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
1606             SDT_APIC, SEL_KPL, GSEL_APIC);
1607 }
1608
/*
 * Disable a vector.  Currently only performs sanity checks; see the
 * "notyet" note below for why the IDT entry itself is left alone.
 */
static void
native_apic_disable_vector(u_int apic_id, u_int vector)
{

        KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
#ifdef KDTRACE_HOOKS
        KASSERT(vector != IDT_DTRACE_RET,
            ("Attempt to overwrite DTrace entry"));
#endif
        KASSERT(ioint_handlers[vector / 32] != NULL,
            ("No ISR handler for vector %u", vector));
#ifdef notyet
        /*
         * We can not currently clear the idt entry because other cpus
         * may have a valid vector at this offset.
         */
        setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
            SEL_KPL, GSEL_APIC);
#endif
}
1629
/* Release an APIC vector when it's no longer in use. */
static void
native_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
{
        struct thread *td;

        KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
            vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
            ("Vector %u does not map to an IRQ line", vector));
        KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
        KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
            irq, ("IRQ mismatch"));
#ifdef KDTRACE_HOOKS
        KASSERT(vector != IDT_DTRACE_RET,
            ("Attempt to overwrite DTrace entry"));
#endif

        /*
         * Bind us to the cpu that owned the vector before freeing it so
         * we don't lose an interrupt delivery race.
         *
         * NOTE(review): binding is skipped while rebooting, presumably
         * because the scheduler cannot be relied on then — confirm.
         */
        td = curthread;
        if (!rebooting) {
                thread_lock(td);
                if (sched_is_bound(td))
                        panic("apic_free_vector: Thread already bound.\n");
                sched_bind(td, apic_cpuid(apic_id));
                thread_unlock(td);
        }
        mtx_lock_spin(&icu_lock);
        lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE;
        mtx_unlock_spin(&icu_lock);
        if (!rebooting) {
                thread_lock(td);
                sched_unbind(td);
                thread_unlock(td);
        }
}
1668
1669 /* Map an IDT vector (APIC) to an IRQ (interrupt source). */
1670 static u_int
1671 apic_idt_to_irq(u_int apic_id, u_int vector)
1672 {
1673         int irq;
1674
1675         KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
1676             vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
1677             ("Vector %u does not map to an IRQ line", vector));
1678 #ifdef KDTRACE_HOOKS
1679         KASSERT(vector != IDT_DTRACE_RET,
1680             ("Attempt to overwrite DTrace entry"));
1681 #endif
1682         irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS];
1683         if (irq < 0)
1684                 irq = 0;
1685         return (irq);
1686 }
1687
#ifdef DDB
/*
 * Dump data about APIC IDT vector mappings.
 */
DB_SHOW_COMMAND(apic, db_show_apic)
{
        struct intsrc *isrc;
        int i, verbose;
        u_int apic_id;
        u_int irq;

        /* "show apic/v" and "/vv" raise the per-IRQ verbosity. */
        if (strcmp(modif, "vv") == 0)
                verbose = 2;
        else if (strcmp(modif, "v") == 0)
                verbose = 1;
        else
                verbose = 0;
        for (apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) {
                if (lapics[apic_id].la_present == 0)
                        continue;
                db_printf("Interrupts bound to lapic %u\n", apic_id);
                for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
                        irq = lapics[apic_id].la_ioint_irqs[i];
                        /* Skip free and reserved (syscall/dtrace/xen) slots. */
                        if (irq == IRQ_FREE || irq == IRQ_SYSCALL)
                                continue;
#ifdef KDTRACE_HOOKS
                        if (irq == IRQ_DTRACE_RET)
                                continue;
#endif
#ifdef XENHVM
                        if (irq == IRQ_EVTCHN)
                                continue;
#endif
                        db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
                        if (irq == IRQ_TIMER)
                                db_printf("lapic timer\n");
                        else if (irq < num_io_irqs) {
                                isrc = intr_lookup_source(irq);
                                if (isrc == NULL || verbose == 0)
                                        db_printf("IRQ %u\n", irq);
                                else
                                        db_dump_intr_event(isrc->is_event,
                                            verbose == 2);
                        } else
                                db_printf("IRQ %u ???\n", irq);
                }
        }
}
1736
/*
 * Print "<prefix>:" followed by the two-digit hex value (base + bit
 * index) of every bit set in 'v', then a newline.  Prints nothing at
 * all when 'v' is zero.
 */
static void
dump_mask(const char *prefix, uint32_t v, int base)
{
        int i, first;

        first = 1;
        for (i = 0; i < 32; i++)
                /*
                 * Use an unsigned constant: "1 << 31" would left-shift
                 * into the sign bit of a signed int, which is undefined
                 * behavior in C.
                 */
                if (v & (1u << i)) {
                        if (first) {
                                db_printf("%s:", prefix);
                                first = 0;
                        }
                        db_printf(" %02x", base + i);
                }
        if (!first)
                db_printf("\n");
}
1754
/* Show info from the lapic regs for this CPU. */
DB_SHOW_COMMAND(lapic, db_show_lapic)
{
        uint32_t v;

        db_printf("lapic ID = %d\n", lapic_id());
        v = lapic_read32(LAPIC_VERSION);
        /* Version register: version in the low byte, max LVT in a high field. */
        db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
            v & 0xf);
        db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
        v = lapic_read32(LAPIC_SVR);
        db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
            v & APIC_SVR_ENABLE ? "enabled" : "disabled");
        db_printf("TPR      = %02x\n", lapic_read32(LAPIC_TPR));

/*
 * Dump one 32-bit window of a 256-bit ISR/TMR/IRR register bank:
 * 'index' selects the window, so bit i of that window corresponds to
 * vector index * 32 + i.
 */
#define dump_field(prefix, regn, index)                                 \
        dump_mask(__XSTRING(prefix ## index),                           \
            lapic_read32(LAPIC_ ## regn ## index),                      \
            index * 32)

        db_printf("In-service Interrupts:\n");
        dump_field(isr, ISR, 0);
        dump_field(isr, ISR, 1);
        dump_field(isr, ISR, 2);
        dump_field(isr, ISR, 3);
        dump_field(isr, ISR, 4);
        dump_field(isr, ISR, 5);
        dump_field(isr, ISR, 6);
        dump_field(isr, ISR, 7);

        db_printf("TMR Interrupts:\n");
        dump_field(tmr, TMR, 0);
        dump_field(tmr, TMR, 1);
        dump_field(tmr, TMR, 2);
        dump_field(tmr, TMR, 3);
        dump_field(tmr, TMR, 4);
        dump_field(tmr, TMR, 5);
        dump_field(tmr, TMR, 6);
        dump_field(tmr, TMR, 7);

        db_printf("IRR Interrupts:\n");
        dump_field(irr, IRR, 0);
        dump_field(irr, IRR, 1);
        dump_field(irr, IRR, 2);
        dump_field(irr, IRR, 3);
        dump_field(irr, IRR, 4);
        dump_field(irr, IRR, 5);
        dump_field(irr, IRR, 6);
        dump_field(irr, IRR, 7);

#undef dump_field
}
1807 #endif
1808
/*
 * APIC probing support code.  This includes code to manage enumerators.
 */

/* List of all registered APIC enumerators. */
static SLIST_HEAD(, apic_enumerator) enumerators =
        SLIST_HEAD_INITIALIZER(enumerators);
/* Enumerator selected by apic_init(); NULL if no APIC was found. */
static struct apic_enumerator *best_enum;
1816
1817 void
1818 apic_register_enumerator(struct apic_enumerator *enumerator)
1819 {
1820 #ifdef INVARIANTS
1821         struct apic_enumerator *apic_enum;
1822
1823         SLIST_FOREACH(apic_enum, &enumerators, apic_next) {
1824                 if (apic_enum == enumerator)
1825                         panic("%s: Duplicate register of %s", __func__,
1826                             enumerator->apic_name);
1827         }
1828 #endif
1829         SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next);
1830 }
1831
1832 /*
1833  * We have to look for CPU's very, very early because certain subsystems
1834  * want to know how many CPU's we have extremely early on in the boot
1835  * process.
1836  */
1837 static void
1838 apic_init(void *dummy __unused)
1839 {
1840         struct apic_enumerator *enumerator;
1841         int retval, best;
1842
1843         /* We only support built in local APICs. */
1844         if (!(cpu_feature & CPUID_APIC))
1845                 return;
1846
1847         /* Don't probe if APIC mode is disabled. */
1848         if (resource_disabled("apic", 0))
1849                 return;
1850
1851         /* Probe all the enumerators to find the best match. */
1852         best_enum = NULL;
1853         best = 0;
1854         SLIST_FOREACH(enumerator, &enumerators, apic_next) {
1855                 retval = enumerator->apic_probe();
1856                 if (retval > 0)
1857                         continue;
1858                 if (best_enum == NULL || best < retval) {
1859                         best_enum = enumerator;
1860                         best = retval;
1861                 }
1862         }
1863         if (best_enum == NULL) {
1864                 if (bootverbose)
1865                         printf("APIC: Could not find any APICs.\n");
1866 #ifndef DEV_ATPIC
1867                 panic("running without device atpic requires a local APIC");
1868 #endif
1869                 return;
1870         }
1871
1872         if (bootverbose)
1873                 printf("APIC: Using the %s enumerator.\n",
1874                     best_enum->apic_name);
1875
1876 #ifdef I686_CPU
1877         /*
1878          * To work around an errata, we disable the local APIC on some
1879          * CPUs during early startup.  We need to turn the local APIC back
1880          * on on such CPUs now.
1881          */
1882         ppro_reenable_apic();
1883 #endif
1884
1885         /* Probe the CPU's in the system. */
1886         retval = best_enum->apic_probe_cpus();
1887         if (retval != 0)
1888                 printf("%s: Failed to probe CPUs: returned %d\n",
1889                     best_enum->apic_name, retval);
1890
1891 }
1892 SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL);
1893
1894 /*
1895  * Setup the local APIC.  We have to do this prior to starting up the APs
1896  * in the SMP case.
1897  */
1898 static void
1899 apic_setup_local(void *dummy __unused)
1900 {
1901         int retval;
1902
1903         if (best_enum == NULL)
1904                 return;
1905
1906         /* Initialize the local APIC. */
1907         retval = best_enum->apic_setup_local();
1908         if (retval != 0)
1909                 printf("%s: Failed to setup the local APIC: returned %d\n",
1910                     best_enum->apic_name, retval);
1911 }
1912 SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL);
1913
/*
 * Setup the I/O APICs.
 */
static void
apic_setup_io(void *dummy __unused)
{
        int retval;

        /* Nothing to do if apic_init() found no usable enumerator. */
        if (best_enum == NULL)
                return;

        /*
         * Local APIC must be registered before other PICs and pseudo PICs
         * for proper suspend/resume order.
         */
        intr_register_pic(&lapic_pic);

        retval = best_enum->apic_setup_io();
        if (retval != 0)
                printf("%s: Failed to setup I/O APICs: returned %d\n",
                    best_enum->apic_name, retval);

        /*
         * Finish setting up the local APIC on the BSP once we know
         * how to properly program the LINT pins.  In particular, this
         * enables the EOI suppression mode, if the LAPIC supports it
         * and the user did not disable the mode.
         */
        lapic_setup(1);
        if (bootverbose)
                lapic_dump("BSP");

        /* Enable the MSI "pic". */
        init_ops.msi_init();

#ifdef XENHVM
        xen_intr_alloc_irqs();
#endif
}
SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);
1954
1955 #ifdef SMP
1956 /*
1957  * Inter Processor Interrupt functions.  The lapic_ipi_*() functions are
1958  * private to the MD code.  The public interface for the rest of the
1959  * kernel is defined in mp_machdep.c.
1960  */
1961
1962 /*
1963  * Wait delay microseconds for IPI to be sent.  If delay is -1, we
1964  * wait forever.
1965  */
1966 static int
1967 native_lapic_ipi_wait(int delay)
1968 {
1969         uint64_t rx;
1970
1971         /* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
1972         if (x2apic_mode)
1973                 return (1);
1974
1975         for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
1976                 if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
1977                     APIC_DELSTAT_IDLE)
1978                         return (1);
1979                 ia32_pause();
1980         }
1981         return (0);
1982 }
1983
/*
 * Program the ICR with the given low word ('icrlo': delivery mode,
 * trigger, level, destination shorthand and vector bits) and dispatch
 * the IPI.  'dest' is the destination APIC ID and is consulted only
 * when icrlo selects explicit destination-field addressing.
 */
static void
native_lapic_ipi_raw(register_t icrlo, u_int dest)
{
        uint64_t icr;
        uint32_t vhi, vlo;
        register_t saveintr;

        /* XXX: Need more sanity checking of icrlo? */
        KASSERT(x2apic_mode || lapic_map != NULL,
            ("%s called too early", __func__));
        KASSERT(x2apic_mode ||
            (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
            ("%s: invalid dest field", __func__));
        KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
            ("%s: reserved bits set in ICR LO register", __func__));

        /* Set destination in ICR HI register if it is being used. */
        if (!x2apic_mode) {
                /*
                 * xAPIC mode: read-modify-write the ICR with interrupts
                 * disabled so the HI/LO pair cannot be torn by an
                 * interrupt handler sending its own IPI on this CPU.
                 */
                saveintr = intr_disable();
                icr = lapic_read_icr();
        }

        if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
                if (x2apic_mode) {
                        vhi = dest;
                } else {
                        /* Merge the destination APIC ID into ICR HI. */
                        vhi = icr >> 32;
                        vhi &= ~APIC_ID_MASK;
                        vhi |= dest << APIC_ID_SHIFT;
                }
        } else {
                vhi = 0;
        }

        /* Program the contents of the IPI and dispatch it. */
        if (x2apic_mode) {
                vlo = icrlo;
        } else {
                /* Preserve the reserved bits read back from ICR LO. */
                vlo = icr;
                vlo &= APIC_ICRLO_RESV_MASK;
                vlo |= icrlo;
        }
        /* The write to ICR LO (or the ICR MSR) is what sends the IPI. */
        lapic_write_icr(vhi, vlo);
        if (!x2apic_mode)
                intr_restore(saveintr);
}
2030
/* How long to wait (in lapic_ipi_wait() units) for a previous IPI to clear. */
#define BEFORE_SPIN     50000
#ifdef DETECT_DEADLOCK
/* How long to wait for our own IPI to be delivered before complaining. */
#define AFTER_SPIN      50
#endif

/*
 * Send an IPI with the given vector to 'dest', which is either one of
 * the APIC_IPI_DEST_* shorthands or an explicit APIC ID.  Vectors at or
 * above IPI_NMI_FIRST are pseudo-vectors requesting NMI delivery.
 */
static void
native_lapic_ipi_vectored(u_int vector, int dest)
{
        register_t icrlo, destfield;

        KASSERT((vector & ~APIC_VECTOR_MASK) == 0,
            ("%s: invalid vector %d", __func__, vector));

        icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;

        /*
         * NMI IPIs are just fake vectors used to send a NMI.  Use special rules
         * regarding NMIs if passed, otherwise specify the vector.
         */
        if (vector >= IPI_NMI_FIRST)
                icrlo |= APIC_DELMODE_NMI;
        else
                icrlo |= vector | APIC_DELMODE_FIXED;
        destfield = 0;
        switch (dest) {
        case APIC_IPI_DEST_SELF:
                icrlo |= APIC_DEST_SELF;
                break;
        case APIC_IPI_DEST_ALL:
                icrlo |= APIC_DEST_ALLISELF;
                break;
        case APIC_IPI_DEST_OTHERS:
                icrlo |= APIC_DEST_ALLESELF;
                break;
        default:
                /* A specific APIC ID goes into the ICR destination field. */
                KASSERT(x2apic_mode ||
                    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
                    ("%s: invalid destination 0x%x", __func__, dest));
                destfield = dest;
        }

        /* Wait for an earlier IPI to finish. */
        if (!lapic_ipi_wait(BEFORE_SPIN)) {
                /* Don't panic on a stuck IPI while already panicking. */
                if (panicstr != NULL)
                        return;
                else
                        panic("APIC: Previous IPI is stuck");
        }

        lapic_ipi_raw(icrlo, destfield);

#ifdef DETECT_DEADLOCK
        /* Wait for IPI to be delivered. */
        if (!lapic_ipi_wait(AFTER_SPIN)) {
#ifdef needsattention
                /*
                 * XXX FIXME:
                 *
                 * The above function waits for the message to actually be
                 * delivered.  It breaks out after an arbitrary timeout
                 * since the message should eventually be delivered (at
                 * least in theory) and that if it wasn't we would catch
                 * the failure with the check above when the next IPI is
                 * sent.
                 *
                 * We could skip this wait entirely, EXCEPT it probably
                 * protects us from other routines that assume that the
                 * message was delivered and acted upon when this function
                 * returns.
                 */
                printf("APIC: IPI might be stuck\n");
#else /* !needsattention */
                /* Wait until message is sent without a timeout. */
                while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
                        ia32_pause();
#endif /* needsattention */
        }
#endif /* DETECT_DEADLOCK */
}
2110
2111 #endif /* SMP */
2112
2113 /*
2114  * Since the IDT is shared by all CPUs the IPI slot update needs to be globally
2115  * visible.
2116  *
2117  * Consider the case where an IPI is generated immediately after allocation:
2118  *     vector = lapic_ipi_alloc(ipifunc);
2119  *     ipi_selected(other_cpus, vector);
2120  *
2121  * In xAPIC mode a write to ICR_LO has serializing semantics because the
2122  * APIC page is mapped as an uncached region. In x2APIC mode there is an
2123  * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
2124  * the IDT slot update is globally visible before the IPI is delivered.
2125  */
2126 static int
2127 native_lapic_ipi_alloc(inthand_t *ipifunc)
2128 {
2129         struct gate_descriptor *ip;
2130         long func;
2131         int idx, vector;
2132
2133         KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
2134             ("invalid ipifunc %p", ipifunc));
2135
2136         vector = -1;
2137         mtx_lock_spin(&icu_lock);
2138         for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
2139                 ip = &idt[idx];
2140                 func = (ip->gd_hioffset << 16) | ip->gd_looffset;
2141                 if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
2142                     (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
2143                         vector = idx;
2144                         setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
2145                         break;
2146                 }
2147         }
2148         mtx_unlock_spin(&icu_lock);
2149         return (vector);
2150 }
2151
2152 static void
2153 native_lapic_ipi_free(int vector)
2154 {
2155         struct gate_descriptor *ip;
2156         long func;
2157
2158         KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
2159             ("%s: invalid vector %d", __func__, vector));
2160
2161         mtx_lock_spin(&icu_lock);
2162         ip = &idt[vector];
2163         func = (ip->gd_hioffset << 16) | ip->gd_looffset;
2164         KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
2165             func != (uintptr_t)&IDTVEC(rsvd_pti),
2166             ("invalid idtfunc %#lx", func));
2167         setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
2168             SEL_KPL, GSEL_APIC);
2169         mtx_unlock_spin(&icu_lock);
2170 }