sys/x86/x86/mca.c

   1 /*-
   2  * Copyright (c) 2009 Hudson River Trading LLC
   3  * Written by: John H. Baldwin <jhb@FreeBSD.org>
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27
  28 /*
  29  * Support for x86 machine check architecture.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #ifdef __amd64__
  36 #define DEV_APIC
  37 #else
  38 #include "opt_apic.h"
  39 #endif
  40
  41 #include <sys/param.h>
  42 #include <sys/bus.h>
  43 #include <sys/interrupt.h>
  44 #include <sys/kernel.h>
  45 #include <sys/lock.h>
  46 #include <sys/malloc.h>
  47 #include <sys/mutex.h>
  48 #include <sys/proc.h>
  49 #include <sys/sched.h>
  50 #include <sys/smp.h>
  51 #include <sys/sysctl.h>
  52 #include <sys/systm.h>
  53 #include <sys/taskqueue.h>
  54 #include <machine/intr_machdep.h>
  55 #include <x86/apicvar.h>
  56 #include <machine/cpu.h>
  57 #include <machine/cputypes.h>
  58 #include <x86/mca.h>
  59 #include <machine/md_var.h>
  60 #include <machine/specialreg.h>
  61
  62 /* Modes for mca_scan() */
  63 enum scan_mode {
  64         POLLED,
  65         MCE,
  66         CMCI,
  67 };
  68
  69 #ifdef DEV_APIC
  70 /*
  71  * State maintained for each monitored MCx bank to control the
  72  * corrected machine check interrupt threshold.
  73  */
  74 struct cmc_state {
  75         int     max_threshold;
  76         time_t  last_intr;
  77 };
  78
  79 struct amd_et_state {
  80         int     cur_threshold;
  81         time_t  last_intr;
  82 };
  83 #endif
  84
  85 struct mca_internal {
  86         struct mca_record rec;
  87         int             logged;
  88         STAILQ_ENTRY(mca_internal) link;
  89 };
  90
  91 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
  92
  93 static volatile int mca_count;  /* Number of records stored. */
  94 static int mca_banks;           /* Number of per-CPU register banks. */
  95
  96 static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL,
  97     "Machine Check Architecture");
  98
  99 static int mca_enabled = 1;
 100 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
 101     "Administrative toggle for machine check support");
 102
 103 static int amd10h_L1TP = 1;
 104 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
 105     "Administrative toggle for logging of level one TLB parity (L1TP) errors");
 106
 107 static int intel6h_HSD131;
 108 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
 109     "Administrative toggle for logging of spurious corrected errors");
 110
 111 int workaround_erratum383;
 112 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
 113     &workaround_erratum383, 0,
 114     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
 115
 116 static STAILQ_HEAD(, mca_internal) mca_freelist;
 117 static int mca_freecount;
 118 static STAILQ_HEAD(, mca_internal) mca_records;
 119 static struct callout mca_timer;
 120 static int mca_ticks = 3600;    /* Check hourly by default. */
 121 static struct taskqueue *mca_tq;
 122 static struct task mca_refill_task, mca_scan_task;
 123 static struct mtx mca_lock;
 124
 125 #ifdef DEV_APIC
 126 static struct cmc_state **cmc_state;            /* Indexed by cpuid, bank. */
 127 static struct amd_et_state *amd_et_state;       /* Indexed by cpuid. */
 128 static int cmc_throttle = 60;   /* Time in seconds to throttle CMCI. */
 129
 130 static int amd_elvt = -1;
 131
 132 static inline bool
 133 amd_thresholding_supported(void)
 134 {
 135         return (cpu_vendor_id == CPU_VENDOR_AMD &&
 136             CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16);
 137 }
 138 #endif
 139
 140 static int
 141 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
 142 {
 143         int error, value;
 144
 145         value = *(int *)arg1;
 146         error = sysctl_handle_int(oidp, &value, 0, req);
 147         if (error || req->newptr == NULL)
 148                 return (error);
 149         if (value <= 0)
 150                 return (EINVAL);
 151         *(int *)arg1 = value;
 152         return (0);
 153 }
 154
 155 static int
 156 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
 157 {
 158         int *name = (int *)arg1;
 159         u_int namelen = arg2;
 160         struct mca_record record;
 161         struct mca_internal *rec;
 162         int i;
 163
 164         if (namelen != 1)
 165                 return (EINVAL);
 166
 167         if (name[0] < 0 || name[0] >= mca_count)
 168                 return (EINVAL);
 169
 170         mtx_lock_spin(&mca_lock);
 171         if (name[0] >= mca_count) {
 172                 mtx_unlock_spin(&mca_lock);
 173                 return (EINVAL);
 174         }
 175         i = 0;
 176         STAILQ_FOREACH(rec, &mca_records, link) {
 177                 if (i == name[0]) {
 178                         record = rec->rec;
 179                         break;
 180                 }
 181                 i++;
 182         }
 183         mtx_unlock_spin(&mca_lock);
 184         return (SYSCTL_OUT(req, &record, sizeof(record)));
 185 }
 186
 187 static const char *
 188 mca_error_ttype(uint16_t mca_error)
 189 {
 190
 191         switch ((mca_error & 0x000c) >> 2) {
 192         case 0:
 193                 return ("I");
 194         case 1:
 195                 return ("D");
 196         case 2:
 197                 return ("G");
 198         }
 199         return ("?");
 200 }
 201
 202 static const char *
 203 mca_error_level(uint16_t mca_error)
 204 {
 205
 206         switch (mca_error & 0x0003) {
 207         case 0:
 208                 return ("L0");
 209         case 1:
 210                 return ("L1");
 211         case 2:
 212                 return ("L2");
 213         case 3:
 214                 return ("LG");
 215         }
 216         return ("L?");
 217 }
 218
 219 static const char *
 220 mca_error_request(uint16_t mca_error)
 221 {
 222
 223         switch ((mca_error & 0x00f0) >> 4) {
 224         case 0x0:
 225                 return ("ERR");
 226         case 0x1:
 227                 return ("RD");
 228         case 0x2:
 229                 return ("WR");
 230         case 0x3:
 231                 return ("DRD");
 232         case 0x4:
 233                 return ("DWR");
 234         case 0x5:
 235                 return ("IRD");
 236         case 0x6:
 237                 return ("PREFETCH");
 238         case 0x7:
 239                 return ("EVICT");
 240         case 0x8:
 241                 return ("SNOOP");
 242         }
 243         return ("???");
 244 }
 245
 246 static const char *
 247 mca_error_mmtype(uint16_t mca_error)
 248 {
 249
 250         switch ((mca_error & 0x70) >> 4) {
 251         case 0x0:
 252                 return ("GEN");
 253         case 0x1:
 254                 return ("RD");
 255         case 0x2:
 256                 return ("WR");
 257         case 0x3:
 258                 return ("AC");
 259         case 0x4:
 260                 return ("MS");
 261         }
 262         return ("???");
 263 }
 264
 265 static int
 266 mca_mute(const struct mca_record *rec)
 267 {
 268
 269         /*
 270          * Skip spurious corrected parity errors generated by Intel Haswell-
 271          * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
 272          * erratum respectively), unless reporting is enabled.
 273          * Note that these errors also have been observed with the D0-stepping
 274          * of Haswell, while at least initially the CPU specification updates
 275          * suggested only the C0-stepping to be affected.  Similarly, Celeron
 276          * 2955U with a CPU ID of 0x45 apparently are also concerned with the
 277          * same problem, with HSM142 only referring to 0x3c and 0x46.
 278          */
 279         if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 280             CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 281             (CPUID_TO_MODEL(cpu_id) == 0x3c ||  /* HSD131, HSM142, HSW131 */
 282             CPUID_TO_MODEL(cpu_id) == 0x3d ||   /* BDM48 */
 283             CPUID_TO_MODEL(cpu_id) == 0x45 ||
 284             CPUID_TO_MODEL(cpu_id) == 0x46) &&  /* HSM142 */
 285             rec->mr_bank == 0 &&
 286             (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
 287             !intel6h_HSD131)
 288                 return (1);
 289
 290         return (0);
 291 }
 292
 293 /* Dump details about a single machine check. */
 294 static void
 295 mca_log(const struct mca_record *rec)
 296 {
 297         uint16_t mca_error;
 298
 299         if (mca_mute(rec))
 300                 return;
 301
 302         printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
 303             (long long)rec->mr_status);
 304         printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
 305             (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
 306         printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
 307             rec->mr_cpu_id, rec->mr_apic_id);
 308         printf("MCA: CPU %d ", rec->mr_cpu);
 309         if (rec->mr_status & MC_STATUS_UC)
 310                 printf("UNCOR ");
 311         else {
 312                 printf("COR ");
 313                 if (rec->mr_mcg_cap & MCG_CAP_CMCI_P)
 314                         printf("(%lld) ", ((long long)rec->mr_status &
 315                             MC_STATUS_COR_COUNT) >> 38);
 316         }
 317         if (rec->mr_status & MC_STATUS_PCC)
 318                 printf("PCC ");
 319         if (rec->mr_status & MC_STATUS_OVER)
 320                 printf("OVER ");
 321         mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
 322         switch (mca_error) {
 323                 /* Simple error codes. */
 324         case 0x0000:
 325                 printf("no error");
 326                 break;
 327         case 0x0001:
 328                 printf("unclassified error");
 329                 break;
 330         case 0x0002:
 331                 printf("ucode ROM parity error");
 332                 break;
 333         case 0x0003:
 334                 printf("external error");
 335                 break;
 336         case 0x0004:
 337                 printf("FRC error");
 338                 break;
 339         case 0x0005:
 340                 printf("internal parity error");
 341                 break;
 342         case 0x0400:
 343                 printf("internal timer error");
 344                 break;
 345         default:
 346                 if ((mca_error & 0xfc00) == 0x0400) {
 347                         printf("internal error %x", mca_error & 0x03ff);
 348                         break;
 349                 }
 350
 351                 /* Compound error codes. */
 352
 353                 /* Memory hierarchy error. */
 354                 if ((mca_error & 0xeffc) == 0x000c) {
 355                         printf("%s memory error", mca_error_level(mca_error));
 356                         break;
 357                 }
 358
 359                 /* TLB error. */
 360                 if ((mca_error & 0xeff0) == 0x0010) {
 361                         printf("%sTLB %s error", mca_error_ttype(mca_error),
 362                             mca_error_level(mca_error));
 363                         break;
 364                 }
 365
 366                 /* Memory controller error. */
 367                 if ((mca_error & 0xef80) == 0x0080) {
 368                         printf("%s channel ", mca_error_mmtype(mca_error));
 369                         if ((mca_error & 0x000f) != 0x000f)
 370                                 printf("%d", mca_error & 0x000f);
 371                         else
 372                                 printf("??");
 373                         printf(" memory error");
 374                         break;
 375                 }
 376
 377                 /* Cache error. */
 378                 if ((mca_error & 0xef00) == 0x0100) {
 379                         printf("%sCACHE %s %s error",
 380                             mca_error_ttype(mca_error),
 381                             mca_error_level(mca_error),
 382                             mca_error_request(mca_error));
 383                         break;
 384                 }
 385
 386                 /* Bus and/or Interconnect error. */
 387                 if ((mca_error & 0xe800) == 0x0800) {
 388                         printf("BUS%s ", mca_error_level(mca_error));
 389                         switch ((mca_error & 0x0600) >> 9) {
 390                         case 0:
 391                                 printf("Source");
 392                                 break;
 393                         case 1:
 394                                 printf("Responder");
 395                                 break;
 396                         case 2:
 397                                 printf("Observer");
 398                                 break;
 399                         default:
 400                                 printf("???");
 401                                 break;
 402                         }
 403                         printf(" %s ", mca_error_request(mca_error));
 404                         switch ((mca_error & 0x000c) >> 2) {
 405                         case 0:
 406                                 printf("Memory");
 407                                 break;
 408                         case 2:
 409                                 printf("I/O");
 410                                 break;
 411                         case 3:
 412                                 printf("Other");
 413                                 break;
 414                         default:
 415                                 printf("???");
 416                                 break;
 417                         }
 418                         if (mca_error & 0x0100)
 419                                 printf(" timed out");
 420                         break;
 421                 }
 422
 423                 printf("unknown error %x", mca_error);
 424                 break;
 425         }
 426         printf("\n");
 427         if (rec->mr_status & MC_STATUS_ADDRV)
 428                 printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
 429         if (rec->mr_status & MC_STATUS_MISCV)
 430                 printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
 431 }
 432
 433 static int
 434 mca_check_status(int bank, struct mca_record *rec)
 435 {
 436         uint64_t status;
 437         u_int p[4];
 438
 439         status = rdmsr(MSR_MC_STATUS(bank));
 440         if (!(status & MC_STATUS_VAL))
 441                 return (0);
 442
 443         /* Save exception information. */
 444         rec->mr_status = status;
 445         rec->mr_bank = bank;
 446         rec->mr_addr = 0;
 447         if (status & MC_STATUS_ADDRV)
 448                 rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
 449         rec->mr_misc = 0;
 450         if (status & MC_STATUS_MISCV)
 451                 rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
 452         rec->mr_tsc = rdtsc();
 453         rec->mr_apic_id = PCPU_GET(apic_id);
 454         rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
 455         rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
 456         rec->mr_cpu_id = cpu_id;
 457         rec->mr_cpu_vendor_id = cpu_vendor_id;
 458         rec->mr_cpu = PCPU_GET(cpuid);
 459
 460         /*
 461          * Clear machine check.  Don't do this for uncorrectable
 462          * errors so that the BIOS can see them.
 463          */
 464         if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
 465                 wrmsr(MSR_MC_STATUS(bank), 0);
 466                 do_cpuid(0, p);
 467         }
 468         return (1);
 469 }
 470
 471 static void
 472 mca_fill_freelist(void)
 473 {
 474         struct mca_internal *rec;
 475         int desired;
 476
 477         /*
 478          * Ensure we have at least one record for each bank and one
 479          * record per CPU.
 480          */
 481         desired = imax(mp_ncpus, mca_banks);
 482         mtx_lock_spin(&mca_lock);
 483         while (mca_freecount < desired) {
 484                 mtx_unlock_spin(&mca_lock);
 485                 rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
 486                 mtx_lock_spin(&mca_lock);
 487                 STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
 488                 mca_freecount++;
 489         }
 490         mtx_unlock_spin(&mca_lock);
 491 }
 492
 493 static void
 494 mca_refill(void *context, int pending)
 495 {
 496
 497         mca_fill_freelist();
 498 }
 499
 500 static void
 501 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
 502 {
 503         struct mca_internal *rec;
 504
 505         if (mode == POLLED) {
 506                 rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
 507                 mtx_lock_spin(&mca_lock);
 508         } else {
 509                 mtx_lock_spin(&mca_lock);
 510                 rec = STAILQ_FIRST(&mca_freelist);
 511                 if (rec == NULL) {
 512                         printf("MCA: Unable to allocate space for an event.\n");
 513                         mca_log(record);
 514                         mtx_unlock_spin(&mca_lock);
 515                         return;
 516                 }
 517                 STAILQ_REMOVE_HEAD(&mca_freelist, link);
 518                 mca_freecount--;
 519         }
 520
 521         rec->rec = *record;
 522         rec->logged = 0;
 523         STAILQ_INSERT_TAIL(&mca_records, rec, link);
 524         mca_count++;
 525         mtx_unlock_spin(&mca_lock);
 526         if (mode == CMCI && !cold)
 527                 taskqueue_enqueue(mca_tq, &mca_refill_task);
 528 }
 529
 530 #ifdef DEV_APIC
 531 /*
 532  * Update the interrupt threshold for a CMCI.  The strategy is to use
 533  * a low trigger that interrupts as soon as the first event occurs.
 534  * However, if a steady stream of events arrive, the threshold is
 535  * increased until the interrupts are throttled to once every
 536  * cmc_throttle seconds or the periodic scan.  If a periodic scan
 537  * finds that the threshold is too high, it is lowered.
 538  */
 539 static int
 540 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
 541     int cur_threshold, int max_threshold)
 542 {
 543         u_int delta;
 544         int limit;
 545
 546         delta = (u_int)(time_uptime - last_intr);
 547         limit = cur_threshold;
 548
 549         /*
 550          * If an interrupt was received less than cmc_throttle seconds
 551          * since the previous interrupt and the count from the current
 552          * event is greater than or equal to the current threshold,
 553          * double the threshold up to the max.
 554          */
 555         if (mode == CMCI && valid) {
 556                 if (delta < cmc_throttle && count >= limit &&
 557                     limit < max_threshold) {
 558                         limit = min(limit << 1, max_threshold);
 559                 }
 560                 return (limit);
 561         }
 562
 563         /*
 564          * When the banks are polled, check to see if the threshold
 565          * should be lowered.
 566          */
 567         if (mode != POLLED)
 568                 return (limit);
 569
 570         /* If a CMCI occured recently, do nothing for now. */
 571         if (delta < cmc_throttle)
 572                 return (limit);
 573
 574         /*
 575          * Compute a new limit based on the average rate of events per
 576          * cmc_throttle seconds since the last interrupt.
 577          */
 578         if (valid) {
 579                 limit = count * cmc_throttle / delta;
 580                 if (limit <= 0)
 581                         limit = 1;
 582                 else if (limit > max_threshold)
 583                         limit = max_threshold;
 584         } else {
 585                 limit = 1;
 586         }
 587         return (limit);
 588 }
 589
 590 static void
 591 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
 592 {
 593         struct cmc_state *cc;
 594         uint64_t ctl;
 595         int cur_threshold, new_threshold;
 596         int count;
 597
 598         /* Fetch the current limit for this bank. */
 599         cc = &cmc_state[PCPU_GET(cpuid)][bank];
 600         ctl = rdmsr(MSR_MC_CTL2(bank));
 601         count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
 602         cur_threshold = ctl & MC_CTL2_THRESHOLD;
 603
 604         new_threshold = update_threshold(mode, valid, cc->last_intr, count,
 605             cur_threshold, cc->max_threshold);
 606
 607         if (mode == CMCI && valid)
 608                 cc->last_intr = time_uptime;
 609         if (new_threshold != cur_threshold) {
 610                 ctl &= ~MC_CTL2_THRESHOLD;
 611                 ctl |= new_threshold;
 612                 wrmsr(MSR_MC_CTL2(bank), ctl);
 613         }
 614 }
 615
 616 static void
 617 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
 618 {
 619         struct amd_et_state *cc;
 620         uint64_t misc;
 621         int new_threshold;
 622         int count;
 623
 624         KASSERT(bank == MC_AMDNB_BANK,
 625             ("%s: unexpected bank %d", __func__, bank));
 626         cc = &amd_et_state[PCPU_GET(cpuid)];
 627         misc = rdmsr(MSR_MC_MISC(bank));
 628         count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT;
 629         count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);
 630
 631         new_threshold = update_threshold(mode, valid, cc->last_intr, count,
 632             cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX);
 633
 634         cc->cur_threshold = new_threshold;
 635         misc &= ~MC_MISC_AMDNB_CNT_MASK;
 636         misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
 637             << MC_MISC_AMDNB_CNT_SHIFT;
 638         misc &= ~MC_MISC_AMDNB_OVERFLOW;
 639         wrmsr(MSR_MC_MISC(bank), misc);
 640         if (mode == CMCI && valid)
 641                 cc->last_intr = time_uptime;
 642 }
 643 #endif
 644
 645 /*
 646  * This scans all the machine check banks of the current CPU to see if
 647  * there are any machine checks.  Any non-recoverable errors are
 648  * reported immediately via mca_log().  The current thread must be
 649  * pinned when this is called.  The 'mode' parameter indicates if we
 650  * are being called from the MC exception handler, the CMCI handler,
 651  * or the periodic poller.  In the MC exception case this function
 652  * returns true if the system is restartable.  Otherwise, it returns a
 653  * count of the number of valid MC records found.
 654  */
 655 static int
 656 mca_scan(enum scan_mode mode, int *recoverablep)
 657 {
 658         struct mca_record rec;
 659         uint64_t mcg_cap, ucmask;
 660         int count, i, recoverable, valid;
 661
 662         count = 0;
 663         recoverable = 1;
 664         ucmask = MC_STATUS_UC | MC_STATUS_PCC;
 665
 666         /* When handling a MCE#, treat the OVER flag as non-restartable. */
 667         if (mode == MCE)
 668                 ucmask |= MC_STATUS_OVER;
 669         mcg_cap = rdmsr(MSR_MCG_CAP);
 670         for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
 671 #ifdef DEV_APIC
 672                 /*
 673                  * For a CMCI, only check banks this CPU is
 674                  * responsible for.
 675                  */
 676                 if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
 677                         continue;
 678 #endif
 679
 680                 valid = mca_check_status(i, &rec);
 681                 if (valid) {
 682                         count++;
 683                         if (rec.mr_status & ucmask) {
 684                                 recoverable = 0;
 685                                 mtx_lock_spin(&mca_lock);
 686                                 mca_log(&rec);
 687                                 mtx_unlock_spin(&mca_lock);
 688                         }
 689                         mca_record_entry(mode, &rec);
 690                 }
 691
 692 #ifdef DEV_APIC
 693                 /*
 694                  * If this is a bank this CPU monitors via CMCI,
 695                  * update the threshold.
 696                  */
 697                 if (PCPU_GET(cmci_mask) & 1 << i) {
 698                         if (cmc_state != NULL)
 699                                 cmci_update(mode, i, valid, &rec);
 700                         else
 701                                 amd_thresholding_update(mode, i, valid);
 702                 }
 703 #endif
 704         }
 705         if (mode == POLLED)
 706                 mca_fill_freelist();
 707         if (recoverablep != NULL)
 708                 *recoverablep = recoverable;
 709         return (count);
 710 }
 711
 712 /*
 713  * Scan the machine check banks on all CPUs by binding to each CPU in
 714  * turn.  If any of the CPUs contained new machine check records, log
 715  * them to the console.
 716  */
 717 static void
 718 mca_scan_cpus(void *context, int pending)
 719 {
 720         struct mca_internal *mca;
 721         struct thread *td;
 722         int count, cpu;
 723
 724         mca_fill_freelist();
 725         td = curthread;
 726         count = 0;
 727         thread_lock(td);
 728         CPU_FOREACH(cpu) {
 729                 sched_bind(td, cpu);
 730                 thread_unlock(td);
 731                 count += mca_scan(POLLED, NULL);
 732                 thread_lock(td);
 733                 sched_unbind(td);
 734         }
 735         thread_unlock(td);
 736         if (count != 0) {
 737                 mtx_lock_spin(&mca_lock);
 738                 STAILQ_FOREACH(mca, &mca_records, link) {
 739                         if (!mca->logged) {
 740                                 mca->logged = 1;
 741                                 mca_log(&mca->rec);
 742                         }
 743                 }
 744                 mtx_unlock_spin(&mca_lock);
 745         }
 746 }
 747
 748 static void
 749 mca_periodic_scan(void *arg)
 750 {
 751
 752         taskqueue_enqueue(mca_tq, &mca_scan_task);
 753         callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 754 }
 755
 756 static int
 757 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
 758 {
 759         int error, i;
 760
 761         i = 0;
 762         error = sysctl_handle_int(oidp, &i, 0, req);
 763         if (error)
 764                 return (error);
 765         if (i)
 766                 taskqueue_enqueue(mca_tq, &mca_scan_task);
 767         return (0);
 768 }
 769
 770 static void
 771 mca_createtq(void *dummy)
 772 {
 773         if (mca_banks <= 0)
 774                 return;
 775
 776         mca_tq = taskqueue_create_fast("mca", M_WAITOK,
 777             taskqueue_thread_enqueue, &mca_tq);
 778         taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
 779
 780         /* CMCIs during boot may have claimed items from the freelist. */
 781         mca_fill_freelist();
 782 }
 783 SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
 784
 785 static void
 786 mca_startup(void *dummy)
 787 {
 788
 789         if (mca_banks <= 0)
 790                 return;
 791
 792         callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 793 }
 794 #ifdef EARLY_AP_STARTUP
 795 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
 796 #else
 797 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
 798 #endif
 799
 800 #ifdef DEV_APIC
 801 static void
 802 cmci_setup(void)
 803 {
 804         int i;
 805
 806         cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
 807             M_WAITOK);
 808         for (i = 0; i <= mp_maxid; i++)
 809                 cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
 810                     M_MCA, M_WAITOK | M_ZERO);
 811         SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 812             "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 813             &cmc_throttle, 0, sysctl_positive_int, "I",
 814             "Interval in seconds to throttle corrected MC interrupts");
 815 }
 816
 817 static void
 818 amd_thresholding_setup(void)
 819 {
 820
 821         amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state),
 822             M_MCA, M_WAITOK | M_ZERO);
 823         SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 824             "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 825             &cmc_throttle, 0, sysctl_positive_int, "I",
 826             "Interval in seconds to throttle corrected MC interrupts");
 827 }
 828 #endif
 829
 830 static void
 831 mca_setup(uint64_t mcg_cap)
 832 {
 833
 834         /*
 835          * On AMD Family 10h processors, unless logging of level one TLB
 836          * parity (L1TP) errors is disabled, enable the recommended workaround
 837          * for Erratum 383.
 838          */
 839         if (cpu_vendor_id == CPU_VENDOR_AMD &&
 840             CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
 841                 workaround_erratum383 = 1;
 842
 843         mca_banks = mcg_cap & MCG_CAP_COUNT;
 844         mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
 845         STAILQ_INIT(&mca_records);
 846         TASK_INIT(&mca_scan_task, 0, mca_scan_cpus, NULL);
 847         callout_init(&mca_timer, 1);
 848         STAILQ_INIT(&mca_freelist);
 849         TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
 850         mca_fill_freelist();
 851         SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 852             "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
 853             "Record count");
 854         SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 855             "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
 856             0, sysctl_positive_int, "I",
 857             "Periodic interval in seconds to scan for machine checks");
 858         SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 859             "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
 860         SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 861             "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
 862             sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
 863 #ifdef DEV_APIC
 864         if (mcg_cap & MCG_CAP_CMCI_P)
 865                 cmci_setup();
 866         else if (amd_thresholding_supported())
 867                 amd_thresholding_setup();
 868 #endif
 869 }
 870
 871 #ifdef DEV_APIC
 872 /*
 873  * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 874  * set in MC_CTL2, then another CPU is responsible for this bank, so
 875  * ignore it.  If CMCI_EN returns zero after being set, then this bank
 876  * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 877  * now monitor this bank.
 878  */
 879 static void
 880 cmci_monitor(int i)
 881 {
 882         struct cmc_state *cc;
 883         uint64_t ctl;
 884
 885         KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 886
 887         ctl = rdmsr(MSR_MC_CTL2(i));
 888         if (ctl & MC_CTL2_CMCI_EN)
 889                 /* Already monitored by another CPU. */
 890                 return;
 891
 892         /* Set the threshold to one event for now. */
 893         ctl &= ~MC_CTL2_THRESHOLD;
 894         ctl |= MC_CTL2_CMCI_EN | 1;
 895         wrmsr(MSR_MC_CTL2(i), ctl);
 896         ctl = rdmsr(MSR_MC_CTL2(i));
 897         if (!(ctl & MC_CTL2_CMCI_EN))
 898                 /* This bank does not support CMCI. */
 899                 return;
 900
 901         cc = &cmc_state[PCPU_GET(cpuid)][i];
 902
 903         /* Determine maximum threshold. */
 904         ctl &= ~MC_CTL2_THRESHOLD;
 905         ctl |= 0x7fff;
 906         wrmsr(MSR_MC_CTL2(i), ctl);
 907         ctl = rdmsr(MSR_MC_CTL2(i));
 908         cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
 909
 910         /* Start off with a threshold of 1. */
 911         ctl &= ~MC_CTL2_THRESHOLD;
 912         ctl |= 1;
 913         wrmsr(MSR_MC_CTL2(i), ctl);
 914
 915         /* Mark this bank as monitored. */
 916         PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
 917 }
 918
 919 /*
 920  * For resume, reset the threshold for any banks we monitor back to
 921  * one and throw away the timestamp of the last interrupt.
 922  */
 923 static void
 924 cmci_resume(int i)
 925 {
 926         struct cmc_state *cc;
 927         uint64_t ctl;
 928
 929         KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 930
 931         /* Ignore banks not monitored by this CPU. */
 932         if (!(PCPU_GET(cmci_mask) & 1 << i))
 933                 return;
 934
 935         cc = &cmc_state[PCPU_GET(cpuid)][i];
 936         cc->last_intr = 0;
 937         ctl = rdmsr(MSR_MC_CTL2(i));
 938         ctl &= ~MC_CTL2_THRESHOLD;
 939         ctl |= MC_CTL2_CMCI_EN | 1;
 940         wrmsr(MSR_MC_CTL2(i), ctl);
 941 }
 942
 943 static void
 944 amd_thresholding_start(struct amd_et_state *cc)
 945 {
 946         uint64_t misc;
 947
 948         KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
 949         misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
 950         misc &= ~MC_MISC_AMDNB_INT_MASK;
 951         misc |= MC_MISC_AMDNB_INT_LVT;
 952         misc &= ~MC_MISC_AMDNB_LVT_MASK;
 953         misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT;
 954         misc &= ~MC_MISC_AMDNB_CNT_MASK;
 955         misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
 956             << MC_MISC_AMDNB_CNT_SHIFT;
 957         misc &= ~MC_MISC_AMDNB_OVERFLOW;
 958         misc |= MC_MISC_AMDNB_CNTEN;
 959
 960         wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);
 961 }
 962
 963 static void
 964 amd_thresholding_init(void)
 965 {
 966         struct amd_et_state *cc;
 967         uint64_t misc;
 968
 969         /* The counter must be valid and present. */
 970         misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
 971         if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) !=
 972             (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP))
 973                 return;
 974
 975         /* The register should not be locked. */
 976         if ((misc & MC_MISC_AMDNB_LOCK) != 0)
 977                 return;
 978
 979         /*
 980          * If counter is enabled then either the firmware or another CPU
 981          * has already claimed it.
 982          */
 983         if ((misc & MC_MISC_AMDNB_CNTEN) != 0)
 984                 return;
 985
 986         /*
 987          * Configure an Extended Interrupt LVT register for reporting
 988          * counter overflows if that feature is supported and the first
 989          * extended register is available.
 990          */
 991         amd_elvt = lapic_enable_mca_elvt();
 992         if (amd_elvt < 0)
 993                 return;
 994
 995         /* Re-use Intel CMC support infrastructure. */
 996         cc = &amd_et_state[PCPU_GET(cpuid)];
 997         cc->cur_threshold = 1;
 998         amd_thresholding_start(cc);
 999
1000         /* Mark the NB bank as monitored. */
1001         PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK);
1002 }
1003
1004 static void
1005 amd_thresholding_resume(void)
1006 {
1007         struct amd_et_state *cc;
1008
1009         /* Nothing to do if this CPU doesn't monitor the NB bank. */
1010         if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0)
1011                 return;
1012
1013         cc = &amd_et_state[PCPU_GET(cpuid)];
1014         cc->last_intr = 0;
1015         cc->cur_threshold = 1;
1016         amd_thresholding_start(cc);
1017 }
1018 #endif
1019
1020 /*
1021  * Initializes per-CPU machine check registers and enables corrected
1022  * machine check interrupts.
1023  */
1024 static void
1025 _mca_init(int boot)
1026 {
1027         uint64_t mcg_cap;
1028         uint64_t ctl, mask;
1029         int i, skip;
1030
1031         /* MCE is required. */
1032         if (!mca_enabled || !(cpu_feature & CPUID_MCE))
1033                 return;
1034
1035         if (cpu_feature & CPUID_MCA) {
1036                 if (boot)
1037                         PCPU_SET(cmci_mask, 0);
1038
1039                 mcg_cap = rdmsr(MSR_MCG_CAP);
1040                 if (mcg_cap & MCG_CAP_CTL_P)
1041                         /* Enable MCA features. */
1042                         wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
1043                 if (PCPU_GET(cpuid) == 0 && boot)
1044                         mca_setup(mcg_cap);
1045
1046                 /*
1047                  * Disable logging of level one TLB parity (L1TP) errors by
1048                  * the data cache as an alternative workaround for AMD Family
1049                  * 10h Erratum 383.  Unlike the recommended workaround, there
1050                  * is no performance penalty to this workaround.  However,
1051                  * L1TP errors will go unreported.
1052                  */
1053                 if (cpu_vendor_id == CPU_VENDOR_AMD &&
1054                     CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
1055                         mask = rdmsr(MSR_MC0_CTL_MASK);
1056                         if ((mask & (1UL << 5)) == 0)
1057                                 wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
1058                 }
1059                 for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1060                         /* By default enable logging of all errors. */
1061                         ctl = 0xffffffffffffffffUL;
1062                         skip = 0;
1063
1064                         if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1065                                 /*
1066                                  * For P6 models before Nehalem MC0_CTL is
1067                                  * always enabled and reserved.
1068                                  */
1069                                 if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
1070                                     && CPUID_TO_MODEL(cpu_id) < 0x1a)
1071                                         skip = 1;
1072                         } else if (cpu_vendor_id == CPU_VENDOR_AMD) {
1073                                 /* BKDG for Family 10h: unset GartTblWkEn. */
1074                                 if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
1075                                         ctl &= ~(1UL << 10);
1076                         }
1077
1078                         if (!skip)
1079                                 wrmsr(MSR_MC_CTL(i), ctl);
1080
1081 #ifdef DEV_APIC
1082                         if (mcg_cap & MCG_CAP_CMCI_P) {
1083                                 if (boot)
1084                                         cmci_monitor(i);
1085                                 else
1086                                         cmci_resume(i);
1087                         }
1088 #endif
1089
1090                         /* Clear all errors. */
1091                         wrmsr(MSR_MC_STATUS(i), 0);
1092                 }
1093
1094 #ifdef DEV_APIC
1095                 /*
1096                  * AMD Processors from families 10h - 16h provide support
1097                  * for Machine Check Error Thresholding.
1098                  * The processors support counters of MC errors and they
1099                  * can be configured to generate an interrupt when a counter
1100                  * overflows.
1101                  * The counters are all associated with Bank 4 and each
1102                  * of them covers a group of errors reported via that bank.
1103                  * At the moment only the DRAM Error Threshold Group is
1104                  * supported.
1105                  */
1106                 if (amd_thresholding_supported() &&
1107                     (mcg_cap & MCG_CAP_COUNT) >= 4) {
1108                         if (boot)
1109                                 amd_thresholding_init();
1110                         else
1111                                 amd_thresholding_resume();
1112                 } else if (PCPU_GET(cmci_mask) != 0 && boot) {
1113                         lapic_enable_cmc();
1114                 }
1115 #endif
1116         }
1117
1118         load_cr4(rcr4() | CR4_MCE);
1119 }
1120
1121 /* Must be executed on each CPU during boot. */
1122 void
1123 mca_init(void)
1124 {
1125
1126         _mca_init(1);
1127 }
1128
1129 /* Must be executed on each CPU during resume. */
1130 void
1131 mca_resume(void)
1132 {
1133
1134         _mca_init(0);
1135 }
1136
1137 /*
1138  * The machine check registers for the BSP cannot be initialized until
1139  * the local APIC is initialized.  This happens at SI_SUB_CPU,
1140  * SI_ORDER_SECOND.
1141  */
1142 static void
1143 mca_init_bsp(void *arg __unused)
1144 {
1145
1146         mca_init();
1147 }
1148 SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
1149
1150 /* Called when a machine check exception fires. */
1151 void
1152 mca_intr(void)
1153 {
1154         uint64_t mcg_status;
1155         int recoverable, count;
1156
1157         if (!(cpu_feature & CPUID_MCA)) {
1158                 /*
1159                  * Just print the values of the old Pentium registers
1160                  * and panic.
1161                  */
1162                 printf("MC Type: 0x%jx  Address: 0x%jx\n",
1163                     (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
1164                     (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
1165                 panic("Machine check");
1166         }
1167
1168         /* Scan the banks and check for any non-recoverable errors. */
1169         count = mca_scan(MCE, &recoverable);
1170         mcg_status = rdmsr(MSR_MCG_STATUS);
1171         if (!(mcg_status & MCG_STATUS_RIPV))
1172                 recoverable = 0;
1173
1174         if (!recoverable) {
1175                 /*
1176                  * Only panic if the error was detected local to this CPU.
1177                  * Some errors will assert a machine check on all CPUs, but
1178                  * only certain CPUs will find a valid bank to log.
1179                  */
1180                 while (count == 0)
1181                         cpu_spinwait();
1182
1183                 panic("Unrecoverable machine check exception");
1184         }
1185
1186         /* Clear MCIP. */
1187         wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
1188 }
1189
1190 #ifdef DEV_APIC
1191 /* Called for a CMCI (correctable machine check interrupt). */
1192 void
1193 cmc_intr(void)
1194 {
1195         struct mca_internal *mca;
1196         int count;
1197
1198         /*
1199          * Serialize MCA bank scanning to prevent collisions from
1200          * sibling threads.
1201          */
1202         count = mca_scan(CMCI, NULL);
1203
1204         /* If we found anything, log them to the console. */
1205         if (count != 0) {
1206                 mtx_lock_spin(&mca_lock);
1207                 STAILQ_FOREACH(mca, &mca_records, link) {
1208                         if (!mca->logged) {
1209                                 mca->logged = 1;
1210                                 mca_log(&mca->rec);
1211                         }
1212                 }
1213                 mtx_unlock_spin(&mca_lock);
1214         }
1215 }
1216 #endif