1
0
Fork 0

EDAC, skx_edac: Add address translation for non-volatile DIMMs

Currently, this driver doesn't support address translation for
non-volatile DIMMs.

The ACPI ADXL DSM method provides address translation for both volatile
and non-volatile DIMMs. Enable it to use the ACPI DSM methods if they
are supported and there are non-volatile DIMMs populated on the system.

Co-developed-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Mauro Carvalho Chehab <mchehab@kernel.org>
CC: arozansk@redhat.com
CC: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1540106336-5212-1-git-send-email-qiuxu.zhuo@intel.com
hifive-unleashed-5.1
Qiuxu Zhuo 2018-10-21 15:18:56 +08:00 committed by Borislav Petkov
parent 36168d7123
commit ad6e16059d
2 changed files with 181 additions and 13 deletions

View File

@ -234,6 +234,7 @@ config EDAC_SKX
depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
select DMI select DMI
select ACPI_ADXL
help help
Support for error detection and correction the Intel Support for error detection and correction the Intel
Skylake server Integrated Memory Controllers. If your Skylake server Integrated Memory Controllers. If your

View File

@ -26,6 +26,7 @@
#include <linux/bitmap.h> #include <linux/bitmap.h>
#include <linux/math64.h> #include <linux/math64.h>
#include <linux/mod_devicetable.h> #include <linux/mod_devicetable.h>
#include <linux/adxl.h>
#include <acpi/nfit.h> #include <acpi/nfit.h>
#include <asm/cpu_device_id.h> #include <asm/cpu_device_id.h>
#include <asm/intel-family.h> #include <asm/intel-family.h>
@ -35,6 +36,7 @@
#include "edac_module.h" #include "edac_module.h"
#define EDAC_MOD_STR "skx_edac" #define EDAC_MOD_STR "skx_edac"
#define MSG_SIZE 1024
/* /*
* Debug macros * Debug macros
@ -54,6 +56,29 @@
static LIST_HEAD(skx_edac_list); static LIST_HEAD(skx_edac_list);
static u64 skx_tolm, skx_tohm; static u64 skx_tolm, skx_tohm;
static char *skx_msg;
static unsigned int nvdimm_count;
enum {
INDEX_SOCKET,
INDEX_MEMCTRL,
INDEX_CHANNEL,
INDEX_DIMM,
INDEX_MAX
};
static const char * const component_names[] = {
[INDEX_SOCKET] = "ProcessorSocketId",
[INDEX_MEMCTRL] = "MemoryControllerId",
[INDEX_CHANNEL] = "ChannelId",
[INDEX_DIMM] = "DimmSlotId",
};
static int component_indices[ARRAY_SIZE(component_names)];
static int adxl_component_count;
static const char * const *adxl_component_names;
static u64 *adxl_values;
static char *adxl_msg;
#define NUM_IMC 2 /* memory controllers per socket */ #define NUM_IMC 2 /* memory controllers per socket */
#define NUM_CHANNELS 3 /* channels per memory controller */ #define NUM_CHANNELS 3 /* channels per memory controller */
@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
u16 flags; u16 flags;
u64 size = 0; u64 size = 0;
nvdimm_count++;
dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
imc->src_id, 0); imc->src_id, 0);
@ -941,12 +968,46 @@ static void teardown_skx_debug(void)
} }
#endif /*CONFIG_EDAC_DEBUG*/ #endif /*CONFIG_EDAC_DEBUG*/
static bool skx_adxl_decode(struct decoded_addr *res)
{
int i, len = 0;
if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
res->addr < BIT_ULL(32))) {
edac_dbg(0, "Address 0x%llx out of range\n", res->addr);
return false;
}
if (adxl_decode(res->addr, adxl_values)) {
edac_dbg(0, "Failed to decode 0x%llx\n", res->addr);
return false;
}
res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]];
for (i = 0; i < adxl_component_count; i++) {
if (adxl_values[i] == ~0x0ull)
continue;
len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx",
adxl_component_names[i], adxl_values[i]);
if (MSG_SIZE - len <= 0)
break;
}
return true;
}
static void skx_mce_output_error(struct mem_ctl_info *mci, static void skx_mce_output_error(struct mem_ctl_info *mci,
const struct mce *m, const struct mce *m,
struct decoded_addr *res) struct decoded_addr *res)
{ {
enum hw_event_mc_err_type tp_event; enum hw_event_mc_err_type tp_event;
char *type, *optype, msg[256]; char *type, *optype;
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62); bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
break; break;
} }
} }
if (adxl_component_count) {
snprintf(skx_msg, MSG_SIZE, "%s%s err_code:%04x:%04x %s",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode, adxl_msg);
} else {
snprintf(skx_msg, MSG_SIZE,
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode,
res->socket, res->imc, res->rank,
res->bank_group, res->bank_address, res->row, res->column);
}
snprintf(msg, sizeof(msg), edac_dbg(0, "%s\n", skx_msg);
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode,
res->socket, res->imc, res->rank,
res->bank_group, res->bank_address, res->row, res->column);
edac_dbg(0, "%s\n", msg);
/* Call the helper to output message */ /* Call the helper to output message */
edac_mc_handle_error(tp_event, mci, core_err_cnt, edac_mc_handle_error(tp_event, mci, core_err_cnt,
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
res->channel, res->dimm, -1, res->channel, res->dimm, -1,
optype, msg); optype, skx_msg);
}
static struct mem_ctl_info *get_mci(int src_id, int lmc)
{
struct skx_dev *d;
if (lmc > NUM_IMC - 1) {
skx_printk(KERN_ERR, "Bad lmc %d\n", lmc);
return NULL;
}
list_for_each_entry(d, &skx_edac_list, list) {
if (d->imc[0].src_id == src_id)
return d->imc[lmc].mci;
}
skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc);
return NULL;
} }
static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE; return NOTIFY_DONE;
memset(&res, 0, sizeof(res));
res.addr = mce->addr; res.addr = mce->addr;
if (!skx_decode(&res))
if (adxl_component_count) {
if (!skx_adxl_decode(&res))
return NOTIFY_DONE;
mci = get_mci(res.socket, res.imc);
} else {
if (!skx_decode(&res))
return NOTIFY_DONE;
mci = res.dev->imc[res.imc].mci;
}
if (!mci)
return NOTIFY_DONE; return NOTIFY_DONE;
mci = res.dev->imc[res.imc].mci;
if (mce->mcgstatus & MCG_STATUS_MCIP) if (mce->mcgstatus & MCG_STATUS_MCIP)
type = "Exception"; type = "Exception";
@ -1094,6 +1193,62 @@ static void skx_remove(void)
} }
} }
static void __init skx_adxl_get(void)
{
const char * const *names;
int i, j;
names = adxl_get_component_names();
if (!names) {
skx_printk(KERN_NOTICE, "No firmware support for address translation.");
skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
return;
}
for (i = 0; i < INDEX_MAX; i++) {
for (j = 0; names[j]; j++) {
if (!strcmp(component_names[i], names[j])) {
component_indices[i] = j;
break;
}
}
if (!names[j])
goto err;
}
adxl_component_names = names;
while (*names++)
adxl_component_count++;
adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values),
GFP_KERNEL);
if (!adxl_values) {
adxl_component_count = 0;
return;
}
adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
if (!adxl_msg) {
adxl_component_count = 0;
kfree(adxl_values);
}
return;
err:
skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
component_names[i]);
for (j = 0; names[j]; j++)
skx_printk(KERN_CONT, "%s ", names[j]);
skx_printk(KERN_CONT, "\n");
}
static void __exit skx_adxl_put(void)
{
kfree(adxl_values);
kfree(adxl_msg);
}
/* /*
* skx_init: * skx_init:
* make sure we are running on the correct cpu model * make sure we are running on the correct cpu model
@ -1158,6 +1313,15 @@ static int __init skx_init(void)
} }
} }
skx_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
if (!skx_msg) {
rc = -ENOMEM;
goto fail;
}
if (nvdimm_count)
skx_adxl_get();
/* Ensure that the OPSTATE is set correctly for POLL or NMI */ /* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init(); opstate_init();
@ -1176,6 +1340,9 @@ static void __exit skx_exit(void)
edac_dbg(2, "\n"); edac_dbg(2, "\n");
mce_unregister_decode_chain(&skx_mce_dec); mce_unregister_decode_chain(&skx_mce_dec);
skx_remove(); skx_remove();
if (nvdimm_count)
skx_adxl_put();
kfree(skx_msg);
teardown_skx_debug(); teardown_skx_debug();
} }