x86/mce: Separate global and per-CPU quirks

Many quirks are global configuration settings and a handful apply to
each CPU.

Move the per-CPU quirks to the vendor init paths so they are executed on
each online CPU. Apply the global quirks during BSP-only init so they are
executed only once, and early.
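
Roughly, the resulting flow looks like this (a simplified sketch of the
hunks below, not verbatim code; AMD is shown as the example, Intel and
Zhaoxin follow the same pattern):

	/* BSP-only init: runs once, early. */
	void mca_bsp_init(struct cpuinfo_x86 *c)
	{
		...
		switch (c->x86_vendor) {
		case X86_VENDOR_AMD:
			amd_apply_global_quirks(c);	/* bootlog, overflow_recov, ... */
			break;
		...
		}

		if (mca_cfg.monarch_timeout < 0)
			mca_cfg.monarch_timeout = 0;
	}

	/* Vendor init: runs on every online CPU. */
	void mce_amd_feature_init(struct cpuinfo_x86 *c)
	{
		amd_apply_cpu_quirks(c);	/* per-CPU bank ctl fixups */
		...
	}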

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com
Authored by Yazen Ghannam <yazen.ghannam@amd.com> on 2025-09-08 15:40:34 +00:00
Committed by Borislav Petkov (AMD) <bp@alien8.de>
parent a46b2bbe1e
commit 7eee1e9268
3 changed files with 65 additions and 62 deletions

arch/x86/kernel/cpu/mce/amd.c

@@ -646,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
wrmsrq(MSR_K7_HWCR, hwcr);
}
static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
/* This should be disabled by the BIOS, but isn't always */
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
if (c->x86 == 6 && this_cpu_read(mce_num_banks))
mce_banks[0].ctl = 0;
}
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -653,6 +675,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
u32 low = 0, high = 0, address = 0;
int offset = -1;
amd_apply_cpu_quirks(c);
mce_flags.amd_threshold = 1;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {

arch/x86/kernel/cpu/mce/core.c

@@ -1807,8 +1807,9 @@ static void __mcheck_cpu_mce_banks_init(void)
struct mce_bank *b = &mce_banks[i];
/*
* Init them all, __mcheck_cpu_apply_quirks() is going to apply
* the required vendor quirks before
* Init them all by default.
*
* The required vendor quirks will be applied before
* __mcheck_cpu_init_prepare_banks() does the final bank setup.
*/
b->ctl = -1ULL;
@@ -1880,20 +1881,8 @@ static void __mcheck_cpu_init_prepare_banks(void)
}
}
static void apply_quirks_amd(struct cpuinfo_x86 *c)
static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
/* This should be disabled by the BIOS, but isn't always */
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
@@ -1902,13 +1891,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
mca_cfg.bootlog = 0;
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
if (c->x86 == 6 && this_cpu_read(mce_num_banks))
mce_banks[0].ctl = 0;
/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
@@ -1920,25 +1902,12 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
mce_flags.zen_ifu_quirk = 1;
}
static void apply_quirks_intel(struct cpuinfo_x86 *c)
static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
/* Older CPUs (prior to family 6) don't need quirks. */
if (c->x86_vfm < INTEL_PENTIUM_PRO)
return;
/*
* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a
* valid event later, merely don't write CTL0.
*/
if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
mce_banks[0].init = false;
/*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
@@ -1964,7 +1933,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c)
mce_flags.skx_repmov_quirk = 1;
}
static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
{
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
@@ -1976,29 +1945,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
}
}
/* Add per CPU specific workarounds here */
static void __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
struct mca_config *cfg = &mca_cfg;
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
apply_quirks_amd(c);
break;
case X86_VENDOR_INTEL:
apply_quirks_intel(c);
break;
case X86_VENDOR_ZHAOXIN:
apply_quirks_zhaoxin(c);
break;
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
if (cfg->bootlog != 0)
cfg->panic_timeout = 30;
}
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
@@ -2256,6 +2202,23 @@ void mca_bsp_init(struct cpuinfo_x86 *c)
if (cap & MCG_SER_P)
mca_cfg.ser = 1;
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
amd_apply_global_quirks(c);
break;
case X86_VENDOR_INTEL:
intel_apply_global_quirks(c);
break;
case X86_VENDOR_ZHAOXIN:
zhaoxin_apply_global_quirks(c);
break;
}
if (mca_cfg.monarch_timeout < 0)
mca_cfg.monarch_timeout = 0;
if (mca_cfg.bootlog != 0)
mca_cfg.panic_timeout = 30;
}
/*
@@ -2275,8 +2238,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
__mcheck_cpu_apply_quirks(c);
if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");

arch/x86/kernel/cpu/mce/intel.c

@@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
}
}
static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c)
{
/*
* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a
* valid event later, merely don't write CTL0.
*
* Older CPUs (prior to family 6) can't reach this point and already
* return early due to the check of __mcheck_cpu_ancient_init().
*/
if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
this_cpu_ptr(mce_banks_array)[0].init = false;
}
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_apply_cpu_quirks(c);
intel_init_cmci();
intel_init_lmce();
intel_imc_init(c);