diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries/ras.c')
-rw-r--r-- | arch/powerpc/platforms/pseries/ras.c | 340 |
1 files changed, 337 insertions, 3 deletions
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 851ce326874a..d97d52772789 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -27,6 +27,7 @@
 #include <asm/machdep.h>
 #include <asm/rtas.h>
 #include <asm/firmware.h>
+#include <asm/mce.h>
 
 #include "pseries.h"
 
@@ -50,6 +51,111 @@
 static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
 
+/* RTAS pseries MCE errorlog section. */
+struct pseries_mc_errorlog {
+	__be32	fru_id;
+	__be32	proc_id;
+	u8	error_type;
+	/*
+	 * sub_err_type (1 byte). Bit fields depend on error_type
+	 *
+	 *   MSB0
+	 *   |
+	 *   V
+	 *   01234567
+	 *   XXXXXXXX
+	 *
+	 * For error_type == MC_ERROR_TYPE_UE
+	 *   XXXXXXXX
+	 *   X          1: Permanent or Transient UE.
+	 *    X         1: Effective address provided.
+	 *     X        1: Logical address provided.
+	 *      XX      2: Reserved.
+	 *        XXX   3: Type of UE error.
+	 *
+	 * For error_type != MC_ERROR_TYPE_UE
+	 *   XXXXXXXX
+	 *   X          1: Effective address provided.
+	 *    XXXXX     5: Reserved.
+	 *         XX   2: Type of SLB/ERAT/TLB error.
+	 */
+	u8	sub_err_type;
+	u8	reserved_1[6];
+	__be64	effective_address;
+	__be64	logical_address;
+} __packed;
+
+/* RTAS pseries MCE error types */
+#define MC_ERROR_TYPE_UE		0x00
+#define MC_ERROR_TYPE_SLB		0x01
+#define MC_ERROR_TYPE_ERAT		0x02
+#define MC_ERROR_TYPE_TLB		0x04
+#define MC_ERROR_TYPE_D_CACHE		0x05
+#define MC_ERROR_TYPE_I_CACHE		0x07
+
+/* RTAS pseries MCE error sub types */
+#define MC_ERROR_UE_INDETERMINATE		0
+#define MC_ERROR_UE_IFETCH			1
+#define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH	2
+#define MC_ERROR_UE_LOAD_STORE			3
+#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE	4
+
+#define MC_ERROR_SLB_PARITY		0
+#define MC_ERROR_SLB_MULTIHIT		1
+#define MC_ERROR_SLB_INDETERMINATE	2
+
+#define MC_ERROR_ERAT_PARITY		1
+#define MC_ERROR_ERAT_MULTIHIT		2
+#define MC_ERROR_ERAT_INDETERMINATE	3
+
+#define MC_ERROR_TLB_PARITY		1
+#define MC_ERROR_TLB_MULTIHIT		2
+#define MC_ERROR_TLB_INDETERMINATE	3
+
+/*
+ * Return the error sub-type held in the low-order bits of sub_err_type:
+ * three bits for UE, two bits for SLB/ERAT/TLB, and 0 for any other type.
+ */
+static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
+{
+	switch (mlog->error_type) {
+	case	MC_ERROR_TYPE_UE:
+		return (mlog->sub_err_type & 0x07);
+	case	MC_ERROR_TYPE_SLB:
+	case	MC_ERROR_TYPE_ERAT:
+	case	MC_ERROR_TYPE_TLB:
+		return (mlog->sub_err_type & 0x03);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Return the effective address stored in the MCE section, converted to CPU
+ * byte order, or 0 when the "effective address provided" flag is not set
+ * for this error type (0x40 for UE, 0x80 for SLB/ERAT/TLB).
+ */
+static
+inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog)
+{
+	__be64 addr = 0;
+
+	switch (mlog->error_type) {
+	case	MC_ERROR_TYPE_UE:
+		if (mlog->sub_err_type & 0x40)
+			addr = mlog->effective_address;
+		break;
+	case	MC_ERROR_TYPE_SLB:
+	case	MC_ERROR_TYPE_ERAT:
+	case	MC_ERROR_TYPE_TLB:
+		if (mlog->sub_err_type & 0x80)
+			addr = mlog->effective_address;
+		break;
+	default:
+		break;
+	}
+	return be64_to_cpu(addr);
+}
 /*
  * Enable the hotplug interrupt late because processing them may touch other
@@ -237,8 +343,9 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id)
 	 * hotplug events on the ras_log_buf to be handled by rtas_errd.
 	 */
 	if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM ||
-	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU)
-		queue_hotplug_event(hp_elog, NULL, NULL);
+	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU ||
+	    hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM)
+		queue_hotplug_event(hp_elog);
 	else
 		log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
 
@@ -427,6 +534,203 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
+/* Map val to its name in ar[], or "Unknown" when val is out of range. */
+#define VAL_TO_STRING(ar, val)	\
+	(((val) < ARRAY_SIZE(ar)) ? (ar)[(val)] : "Unknown")
+
+/*
+ * Print a description of the machine check reported in errp: severity,
+ * recovery status, NIP, initiator, error type and sub-type and, when one is
+ * provided, the effective address. If errp is not an extended log a single
+ * error message is printed instead; if it has no MCE section nothing is
+ * printed.
+ */
+static void pseries_print_mce_info(struct pt_regs *regs,
+				   struct rtas_error_log *errp)
+{
+	const char *level, *sevstr;
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	u8 error_type, err_sub_type;
+	u64 addr;
+	u8 initiator = rtas_error_initiator(errp);
+	int disposition = rtas_error_disposition(errp);
+
+	static const char * const initiators[] = {
+		"Unknown",
+		"CPU",
+		"PCI",
+		"ISA",
+		"Memory",
+		"Power Mgmt",
+	};
+	/* Indexed by MC_ERROR_TYPE_*; unassigned type values are "Unknown". */
+	static const char * const mc_err_types[] = {
+		"UE",
+		"SLB",
+		"ERAT",
+		"Unknown",
+		"TLB",
+		"D-Cache",
+		"Unknown",
+		"I-Cache",
+	};
+	static const char * const mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+
+	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
+	static const char * const mc_slb_types[] = {
+		"Parity",
+		"Multihit",
+		"Indeterminate",
+	};
+
+	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
+	static const char * const mc_soft_types[] = {
+		"Unknown",
+		"Parity",
+		"Multihit",
+		"Indeterminate",
+	};
+
+	if (!rtas_error_extended(errp)) {
+		pr_err("Machine check interrupt: Missing extended error log\n");
+		return;
+	}
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (pseries_log == NULL)
+		return;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+	error_type = mce_log->error_type;
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+	switch (rtas_error_severity(errp)) {
+	case RTAS_SEVERITY_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case RTAS_SEVERITY_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case RTAS_SEVERITY_ERROR:
+	case RTAS_SEVERITY_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case RTAS_SEVERITY_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Display faulty slb contents for SLB errors. */
+	if (error_type == MC_ERROR_TYPE_SLB)
+		slb_dump_contents(local_paca->mce_faulty_slbs);
+#endif
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+	       disposition == RTAS_DISP_FULLY_RECOVERED ?
+	       "Recovered" : "Not recovered");
+	if (user_mode(regs)) {
+		printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level,
+		       regs->nip, current->pid, current->comm);
+	} else {
+		printk("%s NIP [%016lx]: %pS\n", level, regs->nip,
+		       (void *)regs->nip);
+	}
+	printk("%s Initiator: %s\n", level,
+	       VAL_TO_STRING(initiators, initiator));
+
+	switch (error_type) {
+	case MC_ERROR_TYPE_UE:
+		printk("%s Error type: %s [%s]\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type),
+		       VAL_TO_STRING(mc_ue_types, err_sub_type));
+		break;
+	case MC_ERROR_TYPE_SLB:
+		printk("%s Error type: %s [%s]\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type),
+		       VAL_TO_STRING(mc_slb_types, err_sub_type));
+		break;
+	case MC_ERROR_TYPE_ERAT:
+	case MC_ERROR_TYPE_TLB:
+		printk("%s Error type: %s [%s]\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type),
+		       VAL_TO_STRING(mc_soft_types, err_sub_type));
+		break;
+	default:
+		printk("%s Error type: %s\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type));
+		break;
+	}
+
+	addr = rtas_mc_get_effective_addr(mce_log);
+	if (addr)
+		printk("%s Effective address: %016llx\n", level, addr);
+}
+
+/*
+ * Try to recover from the machine check described by errp and return the
+ * resulting RTAS disposition. With CONFIG_PPC_BOOK3S_64, an unrecovered SLB
+ * or ERAT error is handled by flushing and reloading the SLB and the log is
+ * then marked as recovered; every other case returns the log's disposition.
+ */
+static int mce_handle_error(struct rtas_error_log *errp)
+{
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
+	u8 error_type;
+
+	if (!rtas_error_extended(errp))
+		goto out;
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (pseries_log == NULL)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+	error_type = mce_log->error_type;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (disposition == RTAS_DISP_NOT_RECOVERED) {
+		switch (error_type) {
+		case	MC_ERROR_TYPE_SLB:
+		case	MC_ERROR_TYPE_ERAT:
+			/*
+			 * Store the old slb content in paca before flushing.
+			 * Print this when we go to virtual mode.
+			 * There are chances that we may hit MCE again if there
+			 * is a parity error on the SLB entry we are trying to
+			 * read for saving. Hence limit the slb saving to single
+			 * level of recursion.
+			 */
+			if (local_paca->in_mce == 1)
+				slb_save_contents(local_paca->mce_faulty_slbs);
+			flush_and_reload_slb();
+			disposition = RTAS_DISP_FULLY_RECOVERED;
+			rtas_set_disposition_recovered(errp);
+			break;
+		default:
+			break;
+		}
+	}
+#endif
+
+out:
+	return disposition;
+}
+
 /*
  * Process MCE rtas errlog event.
  */
@@ -452,8 +756,11 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
 	int recovered = 0;
 	int disposition = rtas_error_disposition(err);
 
+	pseries_print_mce_info(regs, err);
+
 	if (!(regs->msr & MSR_RI)) {
 		/* If MSR_RI isn't set, we cannot recover */
+		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
 		recovered = 0;
 
 	} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
@@ -503,11 +810,38 @@ int pSeries_machine_check_exception(struct pt_regs *regs)
 	struct rtas_error_log *errp;
 
 	if (fwnmi_active) {
-		errp = fwnmi_get_errinfo(regs);
 		fwnmi_release_errinfo();
+		errp = fwnmi_get_errlog();
 		if (errp && recover_mce(regs, errp))
 			return 1;
 	}
 
 	return 0;
 }
+
+/*
+ * Real mode machine check handler. Returns 1 when mce_handle_error() reports
+ * the error as fully recovered, 0 otherwise.
+ */
+long pseries_machine_check_realmode(struct pt_regs *regs)
+{
+	struct rtas_error_log *errp;
+	int disposition;
+
+	if (fwnmi_active) {
+		errp = fwnmi_get_errinfo(regs);
+		/* Nothing to recover from without an error log. */
+		if (!errp)
+			return 0;
+		/*
+		 * Call to fwnmi_release_errinfo() in real mode causes kernel
+		 * to panic. Hence we will call it as soon as we go into
+		 * virtual mode.
+		 */
+		disposition = mce_handle_error(errp);
+		if (disposition == RTAS_DISP_FULLY_RECOVERED)
+			return 1;
+	}
+
+	return 0;
+}