]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/amd64/vmm/io/vlapic.c
Add an API to deliver message signalled interrupts to vcpus. This allows
[FreeBSD/FreeBSD.git] / sys / amd64 / vmm / io / vlapic.c
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/lock.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/mutex.h>
37 #include <sys/systm.h>
38 #include <sys/smp.h>
39
40 #include <machine/clock.h>
41 #include <x86/specialreg.h>
42 #include <x86/apicreg.h>
43
44 #include <machine/vmm.h>
45
46 #include "vmm_stat.h"
47 #include "vmm_lapic.h"
48 #include "vmm_ktr.h"
49 #include "vlapic.h"
50 #include "vioapic.h"
51
/*
 * Tracing helpers: emit a KTR trace record tagged with this vlapic's
 * vm and vcpuid.  Thin wrappers around the VCPU_CTR* macros.
 */
#define VLAPIC_CTR0(vlapic, format)                                     \
        VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)

#define VLAPIC_CTR1(vlapic, format, p1)                                 \
        VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)

#define VLAPIC_CTR2(vlapic, format, p1, p2)                             \
        VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2)
/*
 * Trace the eight 32-bit IRRx registers.  The IRRx registers are spaced
 * 16 bytes apart in the APIC page, hence the index stride of 4 uint32_t's
 * (i << 2).
 */
#define VLAPIC_CTR_IRR(vlapic, msg)                                     \
do {                                                                    \
        uint32_t *irrptr = &(vlapic)->apic.irr0;                        \
        irrptr[0] = irrptr[0];  /* silence compiler */                  \
        VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]);      \
} while (0)
74
/*
 * Trace the eight 32-bit ISRx registers.  Same 16-byte register spacing
 * as VLAPIC_CTR_IRR above.
 */
#define VLAPIC_CTR_ISR(vlapic, msg)                                     \
do {                                                                    \
        uint32_t *isrptr = &(vlapic)->apic.isr0;                        \
        isrptr[0] = isrptr[0];  /* silence compiler */                  \
        VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]);      \
        VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]);      \
} while (0)
88
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");

/* Priority class of a vector: its high nibble. */
#define PRIO(x)                 ((x) >> 4)

#define VLAPIC_VERSION          (16)
#define VLAPIC_MAXLVT_ENTRIES   (5)

/* True if the guest has enabled x2apic mode via the APIC base MSR. */
#define x2apic(vlapic)  (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
97
/* vcpu bootstrap states for the INIT/SIPI startup protocol. */
enum boot_state {
        BS_INIT,        /* waiting for an INIT IPI */
        BS_SIPI,        /* INIT seen, waiting for a startup IPI */
        BS_RUNNING      /* vcpu is up and running */
};
103
struct vlapic {
        struct vm               *vm;            /* owning virtual machine */
        int                     vcpuid;         /* vcpu this APIC belongs to */

        struct LAPIC            apic;           /* emulated APIC register page */

        /* NOTE(review): not referenced in this chunk of the file. */
        int                      esr_update;

        struct callout  callout;        /* vlapic timer */
        struct bintime  timer_fire_bt;  /* callout expiry time */
        struct bintime  timer_freq_bt;  /* timer frequency */
        struct bintime  timer_period_bt; /* timer period */
        struct mtx      timer_mtx;      /* protects the timer state above */

        /*
         * The 'isrvec_stk' is a stack of vectors injected by the local apic.
         * A vector is popped from the stack when the processor does an EOI.
         * The vector on the top of the stack is used to compute the
         * Processor Priority in conjunction with the TPR.
         */
        uint8_t                  isrvec_stk[ISRVEC_STK_SIZE];
        int                      isrvec_stk_top;

        uint64_t                msr_apicbase;   /* IA32_APIC_BASE MSR value */
        enum boot_state         boot_state;     /* INIT/SIPI bootstrap state */
};
130
/*
 * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
 * vlapic_callout_handler() and vcpu accesses to the following registers:
 * - initial count register aka icr_timer
 * - current count register aka ccr_timer
 * - divide config register aka dcr_timer
 * - timer LVT register
 *
 * Note that the vlapic_callout_handler() does not write to any of these
 * registers so they can be safely read from the vcpu context without locking.
 */
#define VLAPIC_TIMER_LOCK(vlapic)       mtx_lock_spin(&((vlapic)->timer_mtx))
#define VLAPIC_TIMER_UNLOCK(vlapic)     mtx_unlock_spin(&((vlapic)->timer_mtx))
#define VLAPIC_TIMER_LOCKED(vlapic)     mtx_owned(&((vlapic)->timer_mtx))

/* The timer's input clock: the host TSC frequency. */
#define VLAPIC_BUS_FREQ tsc_freq
147
148 static __inline uint32_t
149 vlapic_get_id(struct vlapic *vlapic)
150 {
151
152         if (x2apic(vlapic))
153                 return (vlapic->vcpuid);
154         else
155                 return (vlapic->vcpuid << 24);
156 }
157
158 static __inline uint32_t
159 vlapic_get_ldr(struct vlapic *vlapic)
160 {
161         struct LAPIC *lapic;
162         int apicid;
163         uint32_t ldr;
164
165         lapic = &vlapic->apic;
166         if (x2apic(vlapic)) {
167                 apicid = vlapic_get_id(vlapic);
168                 ldr = 1 << (apicid & 0xf);
169                 ldr |= (apicid & 0xffff0) << 12;
170                 return (ldr);
171         } else
172                 return (lapic->ldr);
173 }
174
175 static __inline uint32_t
176 vlapic_get_dfr(struct vlapic *vlapic)
177 {
178         struct LAPIC *lapic;
179
180         lapic = &vlapic->apic;
181         if (x2apic(vlapic))
182                 return (0);
183         else
184                 return (lapic->dfr);
185 }
186
187 static void
188 vlapic_set_dfr(struct vlapic *vlapic, uint32_t data)
189 {
190         uint32_t dfr;
191         struct LAPIC *lapic;
192         
193         if (x2apic(vlapic)) {
194                 VM_CTR1(vlapic->vm, "write to DFR in x2apic mode: %#x", data);
195                 return;
196         }
197
198         lapic = &vlapic->apic;
199         dfr = (lapic->dfr & APIC_DFR_RESERVED) | (data & APIC_DFR_MODEL_MASK);
200         if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
201                 VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
202         else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
203                 VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
204         else
205                 VLAPIC_CTR1(vlapic, "vlapic DFR in Unknown Model %#x", dfr);
206
207         lapic->dfr = dfr;
208 }
209
210 static void
211 vlapic_set_ldr(struct vlapic *vlapic, uint32_t data)
212 {
213         struct LAPIC *lapic;
214
215         /* LDR is read-only in x2apic mode */
216         if (x2apic(vlapic)) {
217                 VLAPIC_CTR1(vlapic, "write to LDR in x2apic mode: %#x", data);
218                 return;
219         }
220
221         lapic = &vlapic->apic;
222         lapic->ldr = data & ~APIC_LDR_RESERVED;
223         VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
224 }
225
/*
 * Translate the encoding in the divide configuration register into the
 * divisor applied to the timer's input clock.  Only bits 0, 1 and 3 of
 * the DCR participate in the encoding (hence the 0xB mask).
 */
static int
vlapic_timer_divisor(uint32_t dcr)
{
        switch (dcr & 0xB) {
        case APIC_TDCR_1:
                return (1);
        case APIC_TDCR_2:
                return (2);
        case APIC_TDCR_4:
                return (4);
        case APIC_TDCR_8:
                return (8);
        case APIC_TDCR_16:
                return (16);
        case APIC_TDCR_32:
                return (32);
        case APIC_TDCR_64:
                return (64);
        case APIC_TDCR_128:
                return (128);
        default:
                panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
        }
}
250
251 static void
252 vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
253 {
254         int i;
255         for (i = 0; i < num_lvt; i++) {
256                 *lvts |= APIC_LVT_M;
257                 lvts += 4;
258         }
259 }
260
#if 0
/* Debug helper (compiled out): print the decoded fields of one LVT. */
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
        printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
            *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
            *lvt & APIC_LVTT_M);
}
#endif
270
/*
 * Return the value of the timer's current count register (CCR).
 *
 * If the timer callout is active and has not yet reached its expiry
 * time, the CCR is computed from the time remaining until
 * 'timer_fire_bt'; otherwise it reads as zero.
 */
static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
        struct bintime bt_now, bt_rem;
        struct LAPIC *lapic;
        uint32_t ccr;

        ccr = 0;
        lapic = &vlapic->apic;

        VLAPIC_TIMER_LOCK(vlapic);
        if (callout_active(&vlapic->callout)) {
                /*
                 * If the timer is scheduled to expire in the future then
                 * compute the value of 'ccr' based on the remaining time.
                 */
                binuptime(&bt_now);
                if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
                        bt_rem = vlapic->timer_fire_bt;
                        bintime_sub(&bt_rem, &bt_now);
                        /* Convert the remaining bintime into timer ticks. */
                        ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
                        ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
                }
        }
        KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
            "icr_timer is %#x", ccr, lapic->icr_timer));
        VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
            ccr, lapic->icr_timer);
        VLAPIC_TIMER_UNLOCK(vlapic);
        return (ccr);
}
302
/*
 * Emulate a write to the timer's divide configuration register and
 * recompute the derived timer frequency and period.
 */
static void
vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr)
{
        struct LAPIC *lapic;
        int divisor;

        lapic = &vlapic->apic;
        VLAPIC_TIMER_LOCK(vlapic);

        lapic->dcr_timer = dcr;
        divisor = vlapic_timer_divisor(dcr);
        VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, divisor);

        /*
         * Update the timer frequency and the timer period.
         *
         * XXX changes to the frequency divider will not take effect until
         * the timer is reloaded.
         */
        FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
        vlapic->timer_period_bt = vlapic->timer_freq_bt;
        bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);

        VLAPIC_TIMER_UNLOCK(vlapic);
}
328
329 static void
330 vlapic_update_errors(struct vlapic *vlapic)
331 {
332         struct LAPIC    *lapic = &vlapic->apic;
333         lapic->esr = 0; // XXX 
334 }
335
336 static void
337 vlapic_init_ipi(struct vlapic *vlapic)
338 {
339         struct LAPIC    *lapic = &vlapic->apic;
340         lapic->version = VLAPIC_VERSION;
341         lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
342         lapic->dfr = 0xffffffff;
343         lapic->svr = APIC_SVR_VECTOR;
344         vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
345 }
346
347 static int
348 vlapic_reset(struct vlapic *vlapic)
349 {
350         struct LAPIC    *lapic = &vlapic->apic;
351
352         memset(lapic, 0, sizeof(*lapic));
353         lapic->apr = vlapic->vcpuid;
354         vlapic_init_ipi(vlapic);
355         vlapic_set_dcr(vlapic, 0);
356
357         if (vlapic->vcpuid == 0)
358                 vlapic->boot_state = BS_RUNNING;        /* BSP */
359         else
360                 vlapic->boot_state = BS_INIT;           /* AP */
361         
362         return 0;
363
364 }
365
/*
 * Mark 'vector' as pending in the IRR and record its trigger mode in
 * the TMR.  The write is ignored if the APIC has been software-disabled
 * through the SVR.  The caller is responsible for notifying the vcpu
 * (see e.g. vlapic_fire_timer()).
 */
void
vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
        struct LAPIC    *lapic = &vlapic->apic;
        uint32_t        *irrptr, *tmrptr, mask;
        int             idx;

        if (vector < 0 || vector >= 256)
                panic("vlapic_set_intr_ready: invalid vector %d\n", vector);

        if (!(lapic->svr & APIC_SVR_ENABLE)) {
                VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
                    "interrupt %d", vector);
                return;
        }

        /* IRRx/TMRx registers are 16 bytes apart, hence the stride of 4. */
        idx = (vector / 32) * 4;
        mask = 1 << (vector % 32);

        irrptr = &lapic->irr0;
        atomic_set_int(&irrptr[idx], mask);

        /*
         * Upon acceptance of an interrupt into the IRR the corresponding
         * TMR bit is cleared for edge-triggered interrupts and set for
         * level-triggered interrupts.
         */
        tmrptr = &lapic->tmr0;
        if (level)
                atomic_set_int(&tmrptr[idx], mask);
        else
                atomic_clear_int(&tmrptr[idx], mask);

        VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
}
401
402 static __inline uint32_t *
403 vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
404 {
405         struct LAPIC    *lapic = &vlapic->apic;
406         int              i;
407
408         if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
409                 panic("vlapic_get_lvt: invalid LVT\n");
410         }
411         i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
412         return ((&lapic->lvt_timer) + i);;
413 }
414
415 static __inline uint32_t
416 vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
417 {
418
419         return (*vlapic_get_lvtptr(vlapic, offset));
420 }
421
422 static void
423 vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val)
424 {
425         uint32_t *lvtptr;
426         struct LAPIC *lapic;
427         
428         lapic = &vlapic->apic;
429         lvtptr = vlapic_get_lvtptr(vlapic, offset);     
430
431         if (offset == APIC_OFFSET_TIMER_LVT)
432                 VLAPIC_TIMER_LOCK(vlapic);
433
434         if (!(lapic->svr & APIC_SVR_ENABLE))
435                 val |= APIC_LVT_M;
436         *lvtptr = val;
437
438         if (offset == APIC_OFFSET_TIMER_LVT)
439                 VLAPIC_TIMER_UNLOCK(vlapic);
440 }
441
442 #if 1
443 static void
444 dump_isrvec_stk(struct vlapic *vlapic)
445 {
446         int i;
447         uint32_t *isrptr;
448
449         isrptr = &vlapic->apic.isr0;
450         for (i = 0; i < 8; i++)
451                 printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
452
453         for (i = 0; i <= vlapic->isrvec_stk_top; i++)
454                 printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
455 }
456 #endif
457
/*
 * Algorithm adopted from section "Interrupt, Task and Processor Priority"
 * in Intel Architecture Manual Vol 3a.
 *
 * Recompute the processor priority register (PPR) from the TPR and the
 * vector on top of the in-service vector stack.
 */
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
        int isrvec, tpr, ppr;

        /*
         * Note that the value on the stack at index 0 is always 0.
         *
         * This is a placeholder for the value of ISRV when none of the
         * bits is set in the ISRx registers.
         */
        isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
        tpr = vlapic->apic.tpr;

#if 1
        {
                /*
                 * Sanity check: verify the consistency of 'isrvec_stk'
                 * against the ISRx registers; panics if the invariants
                 * do not hold.
                 */
                int i, lastprio, curprio, vector, idx;
                uint32_t *isrptr;

                if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
                        panic("isrvec_stk is corrupted: %d", isrvec);

                /*
                 * Make sure that the priority of the nested interrupts is
                 * always increasing.
                 */
                lastprio = -1;
                for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
                        curprio = PRIO(vlapic->isrvec_stk[i]);
                        if (curprio <= lastprio) {
                                dump_isrvec_stk(vlapic);
                                panic("isrvec_stk does not satisfy invariant");
                        }
                        lastprio = curprio;
                }

                /*
                 * Make sure that each bit set in the ISRx registers has a
                 * corresponding entry on the isrvec stack.
                 */
                i = 1;
                isrptr = &vlapic->apic.isr0;
                for (vector = 0; vector < 256; vector++) {
                        idx = (vector / 32) * 4;
                        if (isrptr[idx] & (1 << (vector % 32))) {
                                if (i > vlapic->isrvec_stk_top ||
                                    vlapic->isrvec_stk[i] != vector) {
                                        dump_isrvec_stk(vlapic);
                                        panic("ISR and isrvec_stk out of sync");
                                }
                                i++;
                        }
                }
        }
#endif

        /* PPR is the higher priority of the TPR and the in-service vector. */
        if (PRIO(tpr) >= PRIO(isrvec))
                ppr = tpr;
        else
                ppr = isrvec & 0xf0;

        vlapic->apic.ppr = ppr;
        VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}
526
/*
 * Complete an EOI: clear the highest-priority bit set in the ISR, pop
 * the in-service vector stack, recompute the PPR, and forward the EOI
 * to the vioapic for level-triggered interrupts (TMR bit set).
 */
static void
vlapic_process_eoi(struct vlapic *vlapic)
{
        struct LAPIC    *lapic = &vlapic->apic;
        uint32_t        *isrptr, *tmrptr;
        int             i, idx, bitpos, vector;

        isrptr = &lapic->isr0;
        tmrptr = &lapic->tmr0;

        /*
         * The x86 architecture reserves the first 32 vectors for use
         * by the processor, so ISR0 is not scanned.
         */
        for (i = 7; i > 0; i--) {
                idx = i * 4;
                bitpos = fls(isrptr[idx]);
                if (bitpos-- != 0) {
                        if (vlapic->isrvec_stk_top <= 0) {
                                panic("invalid vlapic isrvec_stk_top %d",
                                      vlapic->isrvec_stk_top);
                        }
                        isrptr[idx] &= ~(1 << bitpos);
                        VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
                        vlapic->isrvec_stk_top--;
                        vlapic_update_ppr(vlapic);
                        if ((tmrptr[idx] & (1 << bitpos)) != 0) {
                                vector = i * 32 + bitpos;
                                vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
                                    vector);
                        }
                        return;
                }
        }
}
562
/* Extract the field selected by 'mask' from an LVT register value. */
static __inline int
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{

        return (lvt & mask);
}
569
570 static __inline int
571 vlapic_periodic_timer(struct vlapic *vlapic)
572 {
573         uint32_t lvt;
574         
575         lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
576
577         return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
578 }
579
static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");

/*
 * Deliver the timer LVT interrupt, unless the LVT is masked, and notify
 * the vcpu.  The timer lock must be held (asserted below); this is
 * called from vlapic_callout_handler().
 */
static void
vlapic_fire_timer(struct vlapic *vlapic)
{
        int vector;
        uint32_t lvt;

        KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));

        lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);

        if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
                vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
                vector = vlapic_get_lvt_field(lvt, APIC_LVTT_VECTOR);
                vlapic_set_intr_ready(vlapic, vector, false);
                vcpu_notify_event(vlapic->vm, vlapic->vcpuid);
        }
}
599
600 static void
601 vlapic_callout_handler(void *arg)
602 {
603         struct vlapic *vlapic;
604         struct bintime bt, btnow;
605         sbintime_t rem_sbt;
606
607         vlapic = arg;
608
609         VLAPIC_TIMER_LOCK(vlapic);
610         if (callout_pending(&vlapic->callout))  /* callout was reset */
611                 goto done;
612
613         if (!callout_active(&vlapic->callout))  /* callout was stopped */
614                 goto done;
615
616         callout_deactivate(&vlapic->callout);
617
618         KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled"));
619
620         vlapic_fire_timer(vlapic);
621
622         if (vlapic_periodic_timer(vlapic)) {
623                 binuptime(&btnow);
624                 KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
625                     ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx",
626                     btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
627                     vlapic->timer_fire_bt.frac));
628
629                 /*
630                  * Compute the delta between when the timer was supposed to
631                  * fire and the present time.
632                  */
633                 bt = btnow;
634                 bintime_sub(&bt, &vlapic->timer_fire_bt);
635
636                 rem_sbt = bttosbt(vlapic->timer_period_bt);
637                 if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
638                         /*
639                          * Adjust the time until the next countdown downward
640                          * to account for the lost time.
641                          */
642                         rem_sbt -= bttosbt(bt);
643                 } else {
644                         /*
645                          * If the delta is greater than the timer period then
646                          * just reset our time base instead of trying to catch
647                          * up.
648                          */
649                         vlapic->timer_fire_bt = btnow;
650                         VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
651                             "usecs, period is %lu usecs - resetting time base",
652                             bttosbt(bt) / SBT_1US,
653                             bttosbt(vlapic->timer_period_bt) / SBT_1US);
654                 }
655
656                 bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
657                 callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
658                     vlapic_callout_handler, vlapic, 0);
659         }
660 done:
661         VLAPIC_TIMER_UNLOCK(vlapic);
662 }
663
/*
 * Emulate a write to the timer's initial count register (ICR).  A
 * non-zero count (re)arms the timer callout; writing zero stops it.
 */
static void
vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer)
{
        struct LAPIC *lapic;
        sbintime_t sbt;

        VLAPIC_TIMER_LOCK(vlapic);

        lapic = &vlapic->apic;
        lapic->icr_timer = icr_timer;

        /* Recompute the timer period for the new initial count. */
        vlapic->timer_period_bt = vlapic->timer_freq_bt;
        bintime_mul(&vlapic->timer_period_bt, icr_timer);

        if (icr_timer != 0) {
                /* Record when the timer is due to fire and arm the callout. */
                binuptime(&vlapic->timer_fire_bt);
                bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);

                sbt = bttosbt(vlapic->timer_period_bt);
                callout_reset_sbt(&vlapic->callout, sbt, 0,
                    vlapic_callout_handler, vlapic, 0);
        } else
                callout_stop(&vlapic->callout);

        VLAPIC_TIMER_UNLOCK(vlapic);
}
690
/*
 * This function populates 'dmask' with the set of vcpus that match the
 * addressing specified by the (dest, phys, lowprio) tuple.
 *
 * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
 * or xAPIC (8-bit) destination field.
 */
static void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
    bool lowprio, bool x2apic_dest)
{
        struct vlapic *vlapic;
        uint32_t dfr, ldr, ldest, cluster;
        uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
        cpuset_t amask;
        int vcpuid;

        if ((x2apic_dest && dest == 0xffffffff) ||
            (!x2apic_dest && dest == 0xff)) {
                /*
                 * Broadcast in both logical and physical modes.
                 */
                *dmask = vm_active_cpus(vm);
                return;
        }

        if (phys) {
                /*
                 * Physical mode: destination is APIC ID.
                 */
                CPU_ZERO(dmask);
                vcpuid = vm_apicid2vcpuid(vm, dest);
                if (vcpuid < VM_MAXCPU)
                        CPU_SET(vcpuid, dmask);
        } else {
                /*
                 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
                 * bitmask. This model is only available in the xAPIC mode.
                 */
                mda_flat_ldest = dest & 0xff;

                /*
                 * In the "Cluster Model" the MDA is used to identify a
                 * specific cluster and a set of APICs in that cluster.
                 */
                if (x2apic_dest) {
                        mda_cluster_id = dest >> 16;
                        mda_cluster_ldest = dest & 0xffff;
                } else {
                        mda_cluster_id = (dest >> 4) & 0xf;
                        mda_cluster_ldest = dest & 0xf;
                }

                /*
                 * Logical mode: match each APIC that has a bit set
                 * in its LDR that matches a bit in the ldest.
                 */
                CPU_ZERO(dmask);
                amask = vm_active_cpus(vm);
                while ((vcpuid = CPU_FFS(&amask)) != 0) {
                        vcpuid--;
                        CPU_CLR(vcpuid, &amask);

                        vlapic = vm_lapic(vm, vcpuid);
                        dfr = vlapic_get_dfr(vlapic);
                        ldr = vlapic_get_ldr(vlapic);

                        if ((dfr & APIC_DFR_MODEL_MASK) ==
                            APIC_DFR_MODEL_FLAT) {
                                ldest = ldr >> 24;
                                mda_ldest = mda_flat_ldest;
                        } else if ((dfr & APIC_DFR_MODEL_MASK) ==
                            APIC_DFR_MODEL_CLUSTER) {
                                /* LDR layout differs between x2apic/xapic. */
                                if (x2apic(vlapic)) {
                                        cluster = ldr >> 16;
                                        ldest = ldr & 0xffff;
                                } else {
                                        cluster = ldr >> 28;
                                        ldest = (ldr >> 24) & 0xf;
                                }
                                if (cluster != mda_cluster_id)
                                        continue;
                                mda_ldest = mda_cluster_ldest;
                        } else {
                                /*
                                 * Guest has configured a bad logical
                                 * model for this vcpu - skip it.
                                 */
                                VLAPIC_CTR1(vlapic, "vlapic has bad logical "
                                    "model %x - cannot deliver interrupt", dfr);
                                continue;
                        }

                        if ((mda_ldest & ldest) != 0) {
                                CPU_SET(vcpuid, dmask);
                                /* Lowest priority: deliver to one vcpu only. */
                                if (lowprio)
                                        break;
                        }
                }
        }
}
792
static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");

/*
 * Emulate a write to the interrupt command register (ICR).
 *
 * Fixed and NMI IPIs are delivered entirely in the kernel (returns 0).
 * INIT and STARTUP IPIs drive the AP boot-state machine; a successful
 * STARTUP additionally sets '*retu' and fills in a SPINUP_AP exitcode
 * for userland.  Any other case returns 1 so the exit is handled in
 * userland.
 */
static int
lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu)
{
        int i;
        bool phys;
        cpuset_t dmask;
        uint32_t dest, vec, mode;
        struct vlapic *vlapic2;
        struct vm_exit *vmexit;

        /* Destination is 32 bits in x2apic mode, 8 bits in xapic mode. */
        if (x2apic(vlapic))
                dest = icrval >> 32;
        else
                dest = icrval >> (32 + 24);
        vec = icrval & APIC_VECTOR_MASK;
        mode = icrval & APIC_DELMODE_MASK;

        if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
                /* Resolve the destination shorthand into a vcpu set. */
                switch (icrval & APIC_DEST_MASK) {
                case APIC_DEST_DESTFLD:
                        phys = ((icrval & APIC_DESTMODE_LOG) == 0);
                        vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
                            x2apic(vlapic));
                        break;
                case APIC_DEST_SELF:
                        CPU_SETOF(vlapic->vcpuid, &dmask);
                        break;
                case APIC_DEST_ALLISELF:
                        dmask = vm_active_cpus(vlapic->vm);
                        break;
                case APIC_DEST_ALLESELF:
                        dmask = vm_active_cpus(vlapic->vm);
                        CPU_CLR(vlapic->vcpuid, &dmask);
                        break;
                default:
                        CPU_ZERO(&dmask);       /* satisfy gcc */
                        break;
                }

                while ((i = CPU_FFS(&dmask)) != 0) {
                        i--;
                        CPU_CLR(i, &dmask);
                        if (mode == APIC_DELMODE_FIXED) {
                                lapic_intr_edge(vlapic->vm, i, vec);
                                vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
                                                    IPIS_SENT, i, 1);
                        } else
                                vm_inject_nmi(vlapic->vm, i);
                }

                return (0);     /* handled completely in the kernel */
        }

        if (mode == APIC_DELMODE_INIT) {
                if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
                        return (0);

                /* Only the BSP may INIT another (valid) vcpu. */
                if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
                        vlapic2 = vm_lapic(vlapic->vm, dest);

                        /* move from INIT to waiting-for-SIPI state */
                        if (vlapic2->boot_state == BS_INIT) {
                                vlapic2->boot_state = BS_SIPI;
                        }

                        return (0);
                }
        }

        if (mode == APIC_DELMODE_STARTUP) {
                if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
                        vlapic2 = vm_lapic(vlapic->vm, dest);

                        /*
                         * Ignore SIPIs in any state other than wait-for-SIPI
                         */
                        if (vlapic2->boot_state != BS_SIPI)
                                return (0);

                        /*
                         * XXX this assumes that the startup IPI always succeeds
                         */
                        vlapic2->boot_state = BS_RUNNING;
                        vm_activate_cpu(vlapic2->vm, dest);

                        *retu = true;
                        vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
                        vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
                        vmexit->u.spinup_ap.vcpu = dest;
                        /* The SIPI vector encodes the AP's start page. */
                        vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;

                        return (0);
                }
        }

        /*
         * This will cause a return to userland.
         */
        return (1);
}
895
896 int
897 vlapic_pending_intr(struct vlapic *vlapic)
898 {
899         struct LAPIC    *lapic = &vlapic->apic;
900         int              idx, i, bitpos, vector;
901         uint32_t        *irrptr, val;
902
903         irrptr = &lapic->irr0;
904
905         /*
906          * The x86 architecture reserves the the first 32 vectors for use
907          * by the processor.
908          */
909         for (i = 7; i > 0; i--) {
910                 idx = i * 4;
911                 val = atomic_load_acq_int(&irrptr[idx]);
912                 bitpos = fls(val);
913                 if (bitpos != 0) {
914                         vector = i * 32 + (bitpos - 1);
915                         if (PRIO(vector) > PRIO(lapic->ppr)) {
916                                 VLAPIC_CTR1(vlapic, "pending intr %d", vector);
917                                 return (vector);
918                         } else 
919                                 break;
920                 }
921         }
922         return (-1);
923 }
924
925 void
926 vlapic_intr_accepted(struct vlapic *vlapic, int vector)
927 {
928         struct LAPIC    *lapic = &vlapic->apic;
929         uint32_t        *irrptr, *isrptr;
930         int             idx, stk_top;
931
932         /*
933          * clear the ready bit for vector being accepted in irr 
934          * and set the vector as in service in isr.
935          */
936         idx = (vector / 32) * 4;
937
938         irrptr = &lapic->irr0;
939         atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
940         VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
941
942         isrptr = &lapic->isr0;
943         isrptr[idx] |= 1 << (vector % 32);
944         VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
945
946         /*
947          * Update the PPR
948          */
949         vlapic->isrvec_stk_top++;
950
951         stk_top = vlapic->isrvec_stk_top;
952         if (stk_top >= ISRVEC_STK_SIZE)
953                 panic("isrvec_stk_top overflow %d", stk_top);
954
955         vlapic->isrvec_stk[stk_top] = vector;
956         vlapic_update_ppr(vlapic);
957 }
958
959 static void
960 lapic_set_svr(struct vlapic *vlapic, uint32_t new)
961 {
962         struct LAPIC *lapic;
963         uint32_t old, changed;
964
965         lapic = &vlapic->apic;
966         old = lapic->svr;
967         changed = old ^ new;
968         if ((changed & APIC_SVR_ENABLE) != 0) {
969                 if ((new & APIC_SVR_ENABLE) == 0) {
970                         /*
971                          * The apic is now disabled so stop the apic timer.
972                          */
973                         VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
974                         VLAPIC_TIMER_LOCK(vlapic);
975                         callout_stop(&vlapic->callout);
976                         VLAPIC_TIMER_UNLOCK(vlapic);
977                 } else {
978                         /*
979                          * The apic is now enabled so restart the apic timer
980                          * if it is configured in periodic mode.
981                          */
982                         VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
983                         if (vlapic_periodic_timer(vlapic))
984                                 vlapic_set_icr_timer(vlapic, lapic->icr_timer);
985                 }
986         }
987         lapic->svr = new;
988 }
989
990 int
991 vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
992 {
993         struct LAPIC    *lapic = &vlapic->apic;
994         uint32_t        *reg;
995         int              i;
996
997         if (offset > sizeof(*lapic)) {
998                 *data = 0;
999                 goto done;
1000         }
1001         
1002         offset &= ~3;
1003         switch(offset)
1004         {
1005                 case APIC_OFFSET_ID:
1006                         *data = vlapic_get_id(vlapic);
1007                         break;
1008                 case APIC_OFFSET_VER:
1009                         *data = lapic->version;
1010                         break;
1011                 case APIC_OFFSET_TPR:
1012                         *data = lapic->tpr;
1013                         break;
1014                 case APIC_OFFSET_APR:
1015                         *data = lapic->apr;
1016                         break;
1017                 case APIC_OFFSET_PPR:
1018                         *data = lapic->ppr;
1019                         break;
1020                 case APIC_OFFSET_EOI:
1021                         *data = lapic->eoi;
1022                         break;
1023                 case APIC_OFFSET_LDR:
1024                         *data = vlapic_get_ldr(vlapic);
1025                         break;
1026                 case APIC_OFFSET_DFR:
1027                         *data = vlapic_get_dfr(vlapic);
1028                         break;
1029                 case APIC_OFFSET_SVR:
1030                         *data = lapic->svr;
1031                         break;
1032                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1033                         i = (offset - APIC_OFFSET_ISR0) >> 2;
1034                         reg = &lapic->isr0;
1035                         *data = *(reg + i);
1036                         break;
1037                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1038                         i = (offset - APIC_OFFSET_TMR0) >> 2;
1039                         reg = &lapic->tmr0;
1040                         *data = *(reg + i);
1041                         break;
1042                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1043                         i = (offset - APIC_OFFSET_IRR0) >> 2;
1044                         reg = &lapic->irr0;
1045                         *data = atomic_load_acq_int(reg + i);
1046                         break;
1047                 case APIC_OFFSET_ESR:
1048                         *data = lapic->esr;
1049                         break;
1050                 case APIC_OFFSET_ICR_LOW: 
1051                         *data = lapic->icr_lo;
1052                         break;
1053                 case APIC_OFFSET_ICR_HI: 
1054                         *data = lapic->icr_hi;
1055                         break;
1056                 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1057                         *data = vlapic_get_lvt(vlapic, offset); 
1058                         break;
1059                 case APIC_OFFSET_ICR:
1060                         *data = lapic->icr_timer;
1061                         break;
1062                 case APIC_OFFSET_CCR:
1063                         *data = vlapic_get_ccr(vlapic);
1064                         break;
1065                 case APIC_OFFSET_DCR:
1066                         *data = lapic->dcr_timer;
1067                         break;
1068                 case APIC_OFFSET_RRR:
1069                 default:
1070                         *data = 0;
1071                         break;
1072         }
1073 done:
1074         VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data);
1075         return 0;
1076 }
1077
1078 int
1079 vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
1080 {
1081         struct LAPIC    *lapic = &vlapic->apic;
1082         int             retval;
1083
1084         VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data);
1085
1086         if (offset > sizeof(*lapic)) {
1087                 return 0;
1088         }
1089
1090         retval = 0;
1091         offset &= ~3;
1092         switch(offset)
1093         {
1094                 case APIC_OFFSET_ID:
1095                         break;
1096                 case APIC_OFFSET_TPR:
1097                         lapic->tpr = data & 0xff;
1098                         vlapic_update_ppr(vlapic);
1099                         break;
1100                 case APIC_OFFSET_EOI:
1101                         vlapic_process_eoi(vlapic);
1102                         break;
1103                 case APIC_OFFSET_LDR:
1104                         vlapic_set_ldr(vlapic, data);
1105                         break;
1106                 case APIC_OFFSET_DFR:
1107                         vlapic_set_dfr(vlapic, data);
1108                         break;
1109                 case APIC_OFFSET_SVR:
1110                         lapic_set_svr(vlapic, data);
1111                         break;
1112                 case APIC_OFFSET_ICR_LOW: 
1113                         if (!x2apic(vlapic)) {
1114                                 data &= 0xffffffff;
1115                                 data |= (uint64_t)lapic->icr_hi << 32;
1116                         }
1117                         retval = lapic_process_icr(vlapic, data, retu);
1118                         break;
1119                 case APIC_OFFSET_ICR_HI:
1120                         if (!x2apic(vlapic)) {
1121                                 retval = 0;
1122                                 lapic->icr_hi = data;
1123                         }
1124                         break;
1125                 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1126                         vlapic_set_lvt(vlapic, offset, data);
1127                         break;
1128                 case APIC_OFFSET_ICR:
1129                         vlapic_set_icr_timer(vlapic, data);
1130                         break;
1131
1132                 case APIC_OFFSET_DCR:
1133                         vlapic_set_dcr(vlapic, data);
1134                         break;
1135
1136                 case APIC_OFFSET_ESR:
1137                         vlapic_update_errors(vlapic);
1138                         break;
1139                 case APIC_OFFSET_VER:
1140                 case APIC_OFFSET_APR:
1141                 case APIC_OFFSET_PPR:
1142                 case APIC_OFFSET_RRR:
1143                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1144                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1145                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1146                 case APIC_OFFSET_CCR:
1147                 default:
1148                         // Read only.
1149                         break;
1150         }
1151
1152         return (retval);
1153 }
1154
1155 struct vlapic *
1156 vlapic_init(struct vm *vm, int vcpuid)
1157 {
1158         struct vlapic           *vlapic;
1159
1160         vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
1161         vlapic->vm = vm;
1162         vlapic->vcpuid = vcpuid;
1163
1164         /*
1165          * If the vlapic is configured in x2apic mode then it will be
1166          * accessed in the critical section via the MSR emulation code.
1167          *
1168          * Therefore the timer mutex must be a spinlock because blockable
1169          * mutexes cannot be acquired in a critical section.
1170          */
1171         mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN);
1172         callout_init(&vlapic->callout, 1);
1173
1174         vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
1175
1176         if (vcpuid == 0)
1177                 vlapic->msr_apicbase |= APICBASE_BSP;
1178
1179         vlapic_reset(vlapic);
1180
1181         return (vlapic);
1182 }
1183
void
vlapic_cleanup(struct vlapic *vlapic)
{

	/*
	 * Wait for any in-flight timer callout to finish before freeing
	 * the vlapic so the callout cannot reference freed memory.
	 */
	callout_drain(&vlapic->callout);
	free(vlapic, M_VLAPIC);
}
1191
uint64_t
vlapic_get_apicbase(struct vlapic *vlapic)
{

	/* Return the current value of the virtual IA32_APIC_BASE MSR. */
	return (vlapic->msr_apicbase);
}
1198
1199 void
1200 vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
1201 {
1202         int err;
1203         enum x2apic_state state;
1204
1205         err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
1206         if (err)
1207                 panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
1208
1209         if (state == X2APIC_DISABLED)
1210                 val &= ~APICBASE_X2APIC;
1211
1212         vlapic->msr_apicbase = val;
1213 }
1214
1215 void
1216 vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1217 {
1218         struct vlapic *vlapic;
1219
1220         vlapic = vm_lapic(vm, vcpuid);
1221
1222         if (state == X2APIC_DISABLED)
1223                 vlapic->msr_apicbase &= ~APICBASE_X2APIC;
1224 }
1225
1226 void
1227 vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
1228     int delmode, int vec)
1229 {
1230         bool lowprio;
1231         int vcpuid;
1232         cpuset_t dmask;
1233
1234         if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
1235                 VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
1236                 return;
1237         }
1238         lowprio = (delmode == APIC_DELMODE_LOWPRIO);
1239
1240         /*
1241          * We don't provide any virtual interrupt redirection hardware so
1242          * all interrupts originating from the ioapic or MSI specify the
1243          * 'dest' in the legacy xAPIC format.
1244          */
1245         vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);
1246
1247         while ((vcpuid = CPU_FFS(&dmask)) != 0) {
1248                 vcpuid--;
1249                 CPU_CLR(vcpuid, &dmask);
1250                 lapic_set_intr(vm, vcpuid, vec, level);
1251         }
1252 }
1253
1254 bool
1255 vlapic_enabled(struct vlapic *vlapic)
1256 {
1257         struct LAPIC *lapic = &vlapic->apic;
1258
1259         if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
1260             (lapic->svr & APIC_SVR_ENABLE) != 0)
1261                 return (true);
1262         else
1263                 return (false);
1264 }