From e28ecc0bcc54bcce43dbe534593ab18c97b31fe9 Mon Sep 17 00:00:00 2001 From: Krishna Dhulipala Date: Tue, 2 May 2023 19:09:27 -0700 Subject: [PATCH 1/6] rasdaemon: ipmitool SEL logging of AER CEs on OpenBMC platforms Log to OpenBMC SEL logs, all the AER correctable errors that are handled by the kernel. The IPMI command record fields used are defined in the IPMI specification v2.0. Non-timestamped record type is used here given that the BMC will attach one at the time of logging. Test Plan: Tested on OpenBMC platforms using EINJ. Signed-off-by: Krishna Dhulipala Reviewed-by: Ril Van Riel --- configure.ac | 11 +++++++++++ ras-aer-handler.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/configure.ac b/configure.ac index ab5697d..f64e85b 100644 --- a/configure.ac +++ b/configure.ac @@ -177,6 +177,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" = "xyes"], AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all = xyes]) AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) +AC_ARG_ENABLE([openbmc_unified_sel], + AS_HELP_STRING([--enable-openbmc-unified-sel], [enable OPENBMC_UNIFIED_SEL events (currently experimental)])) + +AS_IF([test "x$enable_openbmc_unified_sel" = "xyes" || test "x$enable_all" = "xyes"], [ + AC_DEFINE(HAVE_OPENBMC_UNIFIED_SEL,1,"have OpenBMC unified SEL") + AC_SUBST([WITH_OPENBMC_UNIFIED_SEL]) +]) +AM_CONDITIONAL([WITH_OPENBMC_UNIFIED_SEL], [test x$enable_openbmc_unified_sel = xyes || test x$enable_all = xyes]) +AM_COND_IF([WITH_OPENBMC_UNIFIED_SEL], [USE_OPENBMC_UNIFIED_SEL="yes"], [USE_OPENBMC_UNIFIED_SEL="no"]) + AC_ARG_ENABLE([cpu_fault_isolation], AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) @@ -228,5 +238,6 @@ compile time options summary CXL events : $USE_CXL Memory CE PFA : $USE_MEMORY_CE_PFA AMP RAS errors : $USE_AMP_NS_DECODE + OpenBMC unified : 
$USE_OPENBMC_UNIFIED_SEL CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/ras-aer-handler.c b/ras-aer-handler.c index 1aa0867..9a2bbdd 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c @@ -68,7 +68,9 @@ int ras_aer_event_handler(struct trace_seq *s, struct ras_aer_event ev; char buf[BUF_LEN]; char ipmi_add_sel[105]; + char openbmc_ipmi_add_sel[105]; uint8_t sel_data[5]; + uint8_t openbmc_sel_data[2]; int seg, bus, dev, fn; /* @@ -182,5 +184,37 @@ int ras_aer_event_handler(struct trace_seq *s, system(ipmi_add_sel); #endif +#ifdef HAVE_OPENBMC_UNIFIED_SEL + /* + * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL + * as a OpenBMC unified SEL record type. + * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec) + * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6 + * The 16 byte that follow form the SEL Record + * defined in IPMI spec chapter 32.1 "SEL Event Records" + * Byte 1~2 are Record ID = 0x00 0x00, unused + * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL + * Byte 4~16 are OEM defined + * Byte 11: + * Byte11[7:3] Device# + * Byte11[2:0] Function# + * Byte 12: Bus number + */ + sscanf(ev.dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn); + + openbmc_sel_data[0] = (((dev & 0x1f) << 3) | (fn & 0x7)); + openbmc_sel_data[1] = bus; + sprintf(openbmc_ipmi_add_sel, + "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff 0x00", + openbmc_sel_data[0], openbmc_sel_data[1]); + + /* + * Use MSI and kernel logging only for CEs since they are high fidelity errors. + * Whereas for all UEs, stick to using the firmware-first reporting route. 
+ */ + if (severity_val == HW_EVENT_AER_CORRECTED) + system(openbmc_ipmi_add_sel); +#endif + return 0; } From 760e3cb5bf128dac10a861cf0b53ec8bb483faab Mon Sep 17 00:00:00 2001 From: Krishna Dhulipala Date: Tue, 2 May 2023 19:09:27 -0700 Subject: [PATCH 2/6] rasdaemon: ipmitool SEL logging of AER CEs on OpenBMC platforms Log to OpenBMC SEL logs, all the AER correctable errors that are handled by the kernel. The IPMI command record fields used are defined in the IPMI specification v2.0. Non-timestamped record type is used here given that the BMC will attach one at the time of logging. Test Plan: Tested on OpenBMC platforms using EINJ. Signed-off-by: Krishna Dhulipala Reviewed-by: Ril Van Riel --- Makefile.am | 6 ++- ras-aer-handler.c | 41 +++++------------ unified-sel.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++ unified-sel.h | 17 +++++++ 4 files changed, 147 insertions(+), 31 deletions(-) create mode 100644 unified-sel.c create mode 100644 unified-sel.h diff --git a/Makefile.am b/Makefile.am index 5bddeac..b948136 100644 --- a/Makefile.am +++ b/Makefile.am @@ -73,7 +73,9 @@ endif if WITH_CPU_FAULT_ISOLATION rasdaemon_SOURCES += ras-cpu-isolation.c queue.c endif - +if WITH_OPENBMC_UNIFIED_SEL + rasdaemon_SOURCES += unified-sel.c +endif if WITH_CXL rasdaemon_SOURCES += ras-cxl-handler.c endif @@ -86,7 +88,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ - ras-cxl-handler.h ras-cpu-isolation.h queue.h + ras-cxl-handler.h ras-cpu-isolation.h queue.h unified-sel.h # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that diff --git a/ras-aer-handler.c b/ras-aer-handler.c index 9a2bbdd..38dce80 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c @@ -25,6 
+25,7 @@ #include "ras-logger.h" #include "bitfield.h" #include "ras-report.h" +#include "unified-sel.h" /* bit field meaning for correctable error */ static const char *aer_cor_errors[32] = { @@ -35,12 +36,15 @@ static const char *aer_cor_errors[32] = { [8] = "RELAY_NUM Rollover", [12] = "Replay Timer Timeout", [13] = "Advisory Non-Fatal", + [14] = "Corrected Internal", + [15] = "Header Log Overflow", }; /* bit field meaning for uncorrectable error */ static const char *aer_uncor_errors[32] = { /* Uncorrectable errors */ [4] = "Data Link Protocol", + [5] = "Surprise Link Down", [12] = "Poisoned TLP", [13] = "Flow Control Protocol", [14] = "Completion Timeout", @@ -50,6 +54,12 @@ static const char *aer_uncor_errors[32] = { [18] = "Malformed TLP", [19] = "ECRC", [20] = "Unsupported Request", + [21] = "ACS Violation", + [22] = "Uncorrected Internal", + [23] = "MC Blocked TLP", + [24] = "AtomicOp Egress Blocked", + [25] = "TLP Prefix Blocked", + [26] = "Poisoned TLP Egrees Blocked", }; #define BUF_LEN 1024 @@ -185,35 +195,8 @@ int ras_aer_event_handler(struct trace_seq *s, #endif #ifdef HAVE_OPENBMC_UNIFIED_SEL - /* - * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL - * as a OpenBMC unified SEL record type. 
- * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec) - * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6 - * The 16 byte that follow form the SEL Record - * defined in IPMI spec chapter 32.1 "SEL Event Records" - * Byte 1~2 are Record ID = 0x00 0x00, unused - * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL - * Byte 4~16 are OEM defined - * Byte 11: - * Byte11[7:3] Device# - * Byte11[2:0] Function# - * Byte 12: Bus number - */ - sscanf(ev.dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn); - - openbmc_sel_data[0] = (((dev & 0x1f) << 3) | (fn & 0x7)); - openbmc_sel_data[1] = bus; - sprintf(openbmc_ipmi_add_sel, - "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff 0x00", - openbmc_sel_data[0], openbmc_sel_data[1]); - - /* - * Use MSI and kernel logging only for CEs since they are high fidelity errors. - * Whereas for all UEs, stick to using the firmware-first reporting route. - */ - if (severity_val == HW_EVENT_AER_CORRECTED) - system(openbmc_ipmi_add_sel); + if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0) + return -1; #endif return 0; diff --git a/unified-sel.c b/unified-sel.c new file mode 100644 index 0000000..287bb4f --- /dev/null +++ b/unified-sel.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2023, Meta Platforms Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include "ras-record.h" +#include "ras-logger.h" +#include "ras-report.h" +#include "unified-sel.h" + +/* CPU Root Port Error ID corresponding to each status bit set */ +static const char *cor_error_ids[32] = { + /* Correctable errors */ + [0] = "0x00", /* Receiver Error */ + [6] = "0x01", /* Bad TLP */ + [7] = "0x02", /* Bad DLLP */ + [8] = "0x04", /* REPLAY_NUM Rollover */ + [12] = "0x03", /* Replay Timer Timeout */ + [13] = "0x05", /* Advisory Non-Fatal */ + [14] = "0x06", /* Corrected Internal */ + [15] = "0x07", /* Header Log Overflow */ +}; + +static const char *uncor_error_ids[32] = { + /* Uncorrectable errors */ + [4] = "0x20", /* Data Link Protocol */ + [5] = "0x21", /* Surprise Link Down */ + [12] = "0x22", /* Poisoned TLP */ + [13] = "0x23", /* Flow Control Protocol */ + [14] = "0x24", /* Completion Timeout */ + [15] = "0x25", /* Completer Abort */ + [16] = "0x26", /* Unexpected Completion */ + [17] = "0x27", /* Receiver Overflow */ + [18] = "0x28", /* Malformed TLP */ + [19] = "0x29", /* ECRC */ + [20] = "0x2A", /* Unsupported Request */ + [21] = "0x2B", /* ACS Violation */ + [22] = "0x2C", /* Uncorrected Internal */ + [23] = "0x2D", /* MC Blocked TLP */ + [24] = "0x2E", /* AtomicOp Egress Blocked */ + [25] = "0x2F", /* TLP Prefix Blocked */ + [26] = "0x30", /* Poisoned TLP Egress Blocked */ +}; + +static int verify_id_log_sel(uint64_t status, + const char **idarray, + unsigned bus, + unsigned dev_fn) +{ + int i; + char openbmc_ipmi_add_sel[105]; + + /* + * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL + * as a OpenBMC unified SEL record type. 
+ * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec) + * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6 + * The 16 byte that follow form the SEL Record + * defined in IPMI spec chapter 32.1 "SEL Event Records" + * Byte 1~2 are Record ID = 0x00 0x00, unused + * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL + * Byte 4~16 are OEM defined + * Byte 11: + * Byte11[7:3] Device# + * Byte11[2:0] Function# + * Byte 12: Bus number + * Byte 13-15: Reserved + * Byte 16: ID of the error detected on the PCIe device that triggered this SEL record + */ + + /* Potentially all error status bits could be set for a given PCIe device. + * Therefore, iterate over all 32 bits each of cor and uncor errors + */ + for (i = 0; i < 32; i++) { + if ((status & (1ULL << i)) && idarray[i]) { + snprintf(openbmc_ipmi_add_sel, sizeof(openbmc_ipmi_add_sel), + "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff %s", + dev_fn & 0xff, bus & 0xff, idarray[i]); + if (system(openbmc_ipmi_add_sel) != 0) + return -1; + } + } + return 0; +} + +int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status) +{ + int bus, dev, dev_fn, fn; + + sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn); + dev_fn = (((dev & 0x1f) << 3) | (fn & 0x7)); + + /* Use the appropriate correctable or uncorrectable error status ID + * for a gien severity level + */ + if (severity == HW_EVENT_AER_CORRECTED) { + if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0) + return -1; + } + else { + if (verify_id_log_sel(status, uncor_error_ids, bus, dev_fn) < 0) + return -1; + } + return 0; +} diff --git a/unified-sel.h b/unified-sel.h new file mode 100644 index 0000000..17458a5 --- /dev/null +++ b/unified-sel.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2023, Meta Platforms Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + + +#ifndef _UNIFIED_SEL_H +#define _UNIFIED_SEL_H + +int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status); + +#endif From afb79023c2da3184ad894e875fcdcd6cc8daa82f Mon Sep 17 00:00:00 2001 From: Krishna Dhulipala Date: Tue, 2 May 2023 19:09:27 -0700 Subject: [PATCH 3/6] rasdaemon: ipmitool SEL logging of AER CEs on OpenBMC platforms Log to OpenBMC SEL logs, all the AER correctable errors that are handled by the kernel. The IPMI command record fields used are defined in the IPMI specification v2.0. Non-timestamped record type is used here given that the BMC will attach one at the time of logging. Test Plan: Tested on OpenBMC platforms using EINJ. Signed-off-by: Krishna Dhulipala Reviewed-by: Ril Van Riel --- ras-aer-handler.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ras-aer-handler.c b/ras-aer-handler.c index 38dce80..4b5e9f0 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c @@ -78,9 +78,7 @@ int ras_aer_event_handler(struct trace_seq *s, struct ras_aer_event ev; char buf[BUF_LEN]; char ipmi_add_sel[105]; - char openbmc_ipmi_add_sel[105]; uint8_t sel_data[5]; - uint8_t openbmc_sel_data[2]; int seg, bus, dev, fn; /* From 414a8bc15285ab9cad8523b90daf0a86f7843ba0 Mon Sep 17 00:00:00 2001 From: Krishna Dhulipala Date: Tue, 18 Jul 2023 21:14:53 -0700 Subject: [PATCH 4/6] Add runtime argument for CLI logging --- ras-aer-handler.c | 14 ++++++++++++-- ras-aer-handler.h | 1 + ras-events.c | 3 ++- ras-events.h | 2 +- rasdaemon.c | 11 ++++++++++- 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/ras-aer-handler.c b/ras-aer-handler.c index 4b5e9f0..7d05247 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c @@ -62,6 +62,15 @@ static const char 
*aer_uncor_errors[32] = { [26] = "Poisoned TLP Egrees Blocked", }; +static bool use_ipmitool = false; + +void ras_aer_handler_init(int enable_ipmitool) +{ +#ifdef HAVE_OPENBMC_UNIFIED_SEL + use_ipmitool = (enable_ipmitool > 0) ? 1 : 0; +#endif +} + #define BUF_LEN 1024 int ras_aer_event_handler(struct trace_seq *s, @@ -193,8 +202,9 @@ int ras_aer_event_handler(struct trace_seq *s, #endif #ifdef HAVE_OPENBMC_UNIFIED_SEL - if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0) - return -1; + if (use_ipmitool) + if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0) + return -1; #endif return 0; diff --git a/ras-aer-handler.h b/ras-aer-handler.h index 4394921..438528a 100644 --- a/ras-aer-handler.h +++ b/ras-aer-handler.h @@ -26,4 +26,5 @@ int ras_aer_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); +void ras_aer_handler_init(int enable_ipmitool); #endif diff --git a/ras-events.c b/ras-events.c index 2662467..6e77c96 100644 --- a/ras-events.c +++ b/ras-events.c @@ -814,7 +814,7 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, return 0; } -int handle_ras_events(int record_events) +int handle_ras_events(int record_events, int enable_ipmitool) { int rc, page_size, i; int num_events = 0; @@ -871,6 +871,7 @@ int handle_ras_events(int record_events) "ras", "mc_event"); #ifdef HAVE_AER + ras_aer_handler_init(enable_ipmitool); rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event", ras_aer_event_handler, NULL, AER_EVENT); if (!rc) diff --git a/ras-events.h b/ras-events.h index a9d67c2..1d0a10c 100644 --- a/ras-events.h +++ b/ras-events.h @@ -105,6 +105,6 @@ enum ghes_severity { /* Function prototypes */ int toggle_ras_mc_event(int enable); -int handle_ras_events(int record_events); +int handle_ras_events(int record_events, int enable_ipmitool); #endif diff --git a/rasdaemon.c b/rasdaemon.c index 66f4dea..65ff9ff 100644 --- a/rasdaemon.c +++ 
b/rasdaemon.c @@ -40,6 +40,7 @@ const char *argp_program_bug_address = "Mauro Carvalho Chehab record_events++; break; +#endif +#ifdef HAVE_OPENBMC_UNIFIED_SEL + case 'i': + args->enable_ipmitool++; + break; #endif case 'f': args->foreground++; @@ -81,6 +87,9 @@ int main(int argc, char *argv[]) {"record", 'r', 0, 0, "record events via sqlite3", 0}, #endif {"foreground", 'f', 0, 0, "run foreground, not daemonize"}, +#ifdef HAVE_OPENBMC_UNIFIED_SEL + {"ipmitool", 'i', 0, 0, "enable ipmitool logging", 0}, +#endif { 0, 0, 0, 0, 0, 0 } }; @@ -116,7 +125,7 @@ int main(int argc, char *argv[]) if (daemon(0,0)) exit(EXIT_FAILURE); - handle_ras_events(args.record_events); + handle_ras_events(args.record_events, args.enable_ipmitool); return 0; } From 626f7c90ff943be9cdc5ca3d3bbeac644bef2485 Mon Sep 17 00:00:00 2001 From: Krishna Dhulipala Date: Mon, 25 Sep 2023 19:29:43 -0700 Subject: [PATCH 5/6] Only log correctable errors for unified SEL Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- unified-sel.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unified-sel.c b/unified-sel.c index 287bb4f..4398da1 100644 --- a/unified-sel.c +++ b/unified-sel.c @@ -106,9 +106,5 @@ int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t st if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0) return -1; } - else { - if (verify_id_log_sel(status, uncor_error_ids, bus, dev_fn) < 0) - return -1; - } return 0; } From bcffec13f53796fe150f6464540d5e324ea6977d Mon Sep 17 00:00:00 2001 From: Krishna Dhulipala Date: Mon, 25 Sep 2023 19:36:05 -0700 Subject: [PATCH 6/6] Fix typo in comment --- unified-sel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-sel.c b/unified-sel.c index 4398da1..2fcdcc0 100644 --- a/unified-sel.c +++ b/unified-sel.c @@ -99,8 +99,8 @@ int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t st sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn); dev_fn = (((dev & 0x1f) << 3) | 
(fn & 0x7)); - /* Use the appropriate correctable or uncorrectable error status ID - * for a gien severity level + /* Use the appropriate correctable error status ID + * for a given severity level */ if (severity == HW_EVENT_AER_CORRECTED) { if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0)