diff --git a/Makefile.am b/Makefile.am
index d4872a9..f53be9c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -78,7 +78,9 @@ endif
 if WITH_CPU_FAULT_ISOLATION
    rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
 endif
-
+if WITH_OPENBMC_UNIFIED_SEL
+   rasdaemon_SOURCES += unified-sel.c
+endif
 if WITH_CXL
    rasdaemon_SOURCES += ras-cxl-handler.c
 endif
@@ -98,7 +100,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
 		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
-		  non-standard-jaguarmicro.h trigger.h
+		  non-standard-jaguarmicro.h trigger.h unified-sel.h
 
 # This rule can't be called with more than one Makefile job (like make -j8)
 # I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index 1059f3c..be3b254 100644
--- a/configure.ac
+++ b/configure.ac
@@ -192,6 +192,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" = "xyes"],
 AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all = xyes])
 AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
 
+AC_ARG_ENABLE([openbmc_unified_sel],
+    AS_HELP_STRING([--enable-openbmc-unified-sel], [enable OPENBMC_UNIFIED_SEL events (currently experimental)]))
+
+AS_IF([test "x$enable_openbmc_unified_sel" = "xyes" || test "x$enable_all" = "xyes"], [
+  AC_DEFINE(HAVE_OPENBMC_UNIFIED_SEL,1,"have OpenBMC unified SEL")
+  AC_SUBST([WITH_OPENBMC_UNIFIED_SEL])
+])
+AM_CONDITIONAL([WITH_OPENBMC_UNIFIED_SEL], [test x$enable_openbmc_unified_sel = xyes || test x$enable_all = xyes])
+AM_COND_IF([WITH_OPENBMC_UNIFIED_SEL], [USE_OPENBMC_UNIFIED_SEL="yes"], [USE_OPENBMC_UNIFIED_SEL="no"])
+
 AC_ARG_ENABLE([jaguar_ns_decode],
     AS_HELP_STRING([--enable-jaguar-ns-decode], [enable JAGUAR_NS_DECODE events (currently experimental)]))
 
@@ -263,6 +273,7 @@ compile time options summary
     CXL events          : $USE_CXL
     Memory CE PFA       : $USE_MEMORY_CE_PFA
     AMP RAS errors      : $USE_AMP_NS_DECODE
+    OpenBMC unified     : $USE_OPENBMC_UNIFIED_SEL
     CPU fault isolation : $USE_CPU_FAULT_ISOLATION
     YITIAN RAS errors   : $USE_YITIAN_NS_DECODE
     JAGUAR RAS errors   : $USE_JAGUAR_NS_DECODE
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
index cf3ecd3..b4b5b35 100644
--- a/ras-aer-handler.c
+++ b/ras-aer-handler.c
@@ -14,6 +14,7 @@
 #include "ras-aer-handler.h"
 #include "ras-logger.h"
 #include "ras-report.h"
+#include "unified-sel.h"
 #include "types.h"
 
 /* bit field meaning for correctable error */
@@ -25,13 +26,15 @@ static const char *aer_cor_errors[32] = {
 	[8]  = "RELAY_NUM Rollover",
 	[12] = "Replay Timer Timeout",
 	[13] = "Advisory Non-Fatal",
-	[14] = "Corrected Internal Error",
+	[14] = "Corrected Internal",
+	[15] = "Header Log Overflow",
 };
 
 /* bit field meaning for uncorrectable error */
 static const char *aer_uncor_errors[32] = {
 	/* Uncorrectable errors */
 	[4]  = "Data Link Protocol",
+	[5]  = "Surprise Link Down",
 	[12] = "Poisoned TLP",
 	[13] = "Flow Control Protocol",
 	[14] = "Completion Timeout",
@@ -41,8 +44,23 @@ static const char *aer_uncor_errors[32] = {
 	[18] = "Malformed TLP",
 	[19] = "ECRC",
 	[20] = "Unsupported Request",
+	[21] = "ACS Violation",
+	[22] = "Uncorrected Internal",
+	[23] = "MC Blocked TLP",
+	[24] = "AtomicOp Egress Blocked",
+	[25] = "TLP Prefix Blocked",
+	[26] = "Poisoned TLP Egress Blocked",
 };
 
+static bool use_ipmitool = false;
+
+void ras_aer_handler_init(int enable_ipmitool)
+{
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
+	use_ipmitool = enable_ipmitool > 0;
+#endif
+}
+
 #define BUF_LEN 1024
 
 int ras_aer_event_handler(struct trace_seq *s,
@@ -185,5 +203,11 @@ int ras_aer_event_handler(struct trace_seq *s,
 		log(SYSLOG, LOG_WARNING, "Failed to execute ipmitool\n");
 #endif
 
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
+	if (use_ipmitool &&
+	    openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0)
+		return -1;
+#endif
+
 	return 0;
 }
diff --git a/ras-aer-handler.h b/ras-aer-handler.h
index 5eee690..ef84788 100644
--- a/ras-aer-handler.h
+++ b/ras-aer-handler.h
@@ -15,4 +15,5 @@
 int ras_aer_event_handler(struct trace_seq *s,
 			  struct tep_record *record,
 			  struct tep_event *event, void *context);
+void ras_aer_handler_init(int enable_ipmitool);
 #endif
diff --git a/ras-events.c b/ras-events.c
index b6e80b2..13a7bdb 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -912,7 +912,7 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
 	return 0;
 }
 
-int handle_ras_events(int record_events)
+int handle_ras_events(int record_events, int enable_ipmitool)
 {
 	int rc, page_size, i;
 	int num_events = 0;
@@ -967,6 +967,7 @@ int handle_ras_events(int record_events)
 				    "ras", "mc_event");
 
 #ifdef HAVE_AER
+	ras_aer_handler_init(enable_ipmitool);
 	rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event",
 			       ras_aer_event_handler, NULL, AER_EVENT);
 	if (!rc)
diff --git a/ras-events.h b/ras-events.h
index 47cc524..87522e1 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -97,7 +97,7 @@ enum ghes_severity {
 
 /* Function prototypes */
 int toggle_ras_mc_event(int enable);
 int ras_offline_mce_event(struct ras_mc_offline_event *event);
-int handle_ras_events(int record_events);
+int handle_ras_events(int record_events, int enable_ipmitool);
 
 #endif
diff --git a/rasdaemon.c b/rasdaemon.c
index 95f997d..4234557 100644
--- a/rasdaemon.c
+++ b/rasdaemon.c
@@ -30,6 +30,7 @@ const char *argp_program_bug_address = "Mauro Carvalho Chehab record_events++;
 		break;
+#endif
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
+	case 'i':
+		args->enable_ipmitool++;
+		break;
 #endif
 	case 'f':
 		args->foreground++;
@@ -152,6 +158,9 @@ int main(int argc, char *argv[])
 		{"record", 'r', 0, 0, "record events via sqlite3", 0},
 #endif
 		{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
+		{"ipmitool", 'i', 0, 0, "enable ipmitool logging", 0},
+#endif
 #ifdef HAVE_MCE
 		{"post-processing", 'p', 0, 0,
 		"Post-processing MCE's with raw register values"},
@@ -200,7 +209,7 @@ int main(int argc, char *argv[])
 		if (daemon(0, 0))
 			exit(EXIT_FAILURE);
 
-	handle_ras_events(args.record_events);
+	handle_ras_events(args.record_events, args.enable_ipmitool);
 
 	return 0;
 }
diff --git a/unified-sel.c b/unified-sel.c
new file mode 100644
index 0000000..2fcdcc0
--- /dev/null
+++ b/unified-sel.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2023, Meta Platforms Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+#include "unified-sel.h"
+
+/* CPU Root Port Error ID corresponding to each status bit set */
+static const char *cor_error_ids[32] = {
+	/* Correctable errors */
+	[0]  = "0x00", /* Receiver Error */
+	[6]  = "0x01", /* Bad TLP */
+	[7]  = "0x02", /* Bad DLLP */
+	[8]  = "0x04", /* RELAY_NUM Rollover */
+	[12] = "0x03", /* Replay Timer Timeout */
+	[13] = "0x05", /* Advisory Non-Fatal */
+	[14] = "0x06", /* Corrected Internal */
+	[15] = "0x07", /* Header Log Overflow */
+};
+
+static const char *uncor_error_ids[32] = {
+	/* Uncorrectable errors */
+	[4]  = "0x20", /* Data Link Protocol */
+	[5]  = "0x21", /* Surprise Link Down */
+	[12] = "0x22", /* Poisoned TLP */
+	[13] = "0x23", /* Flow Control Protocol */
+	[14] = "0x24", /* Completion Timeout */
+	[15] = "0x25", /* Completer Abort */
+	[16] = "0x26", /* Unexpected Completion */
+	[17] = "0x27", /* Receiver Overflow */
+	[18] = "0x28", /* Malformed TLP */
+	[19] = "0x29", /* ECRC */
+	[20] = "0x2A", /* Unsupported Request */
+	[21] = "0x2B", /* ACS Violation */
+	[22] = "0x2C", /* Uncorrected Internal */
+	[23] = "0x2D", /* MC Blocked TLP */
+	[24] = "0x2E", /* AtomicOp Egress Blocked */
+	[25] = "0x2F", /* TLP Prefix Blocked */
+	[26] = "0x30", /* Poisoned TLP Egress Blocked */
+};
+
+/*
+ * Add one SEL record per status bit that is set and has an ID in idarray.
+ * Returns 0 on success, -1 if a command cannot be built or ipmitool fails.
+ */
+static int verify_id_log_sel(uint64_t status,
+			     const char **idarray,
+			     unsigned bus,
+			     unsigned dev_fn)
+{
+	int i;
+	int len;
+	char openbmc_ipmi_add_sel[105];
+
+	/*
+	 * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL
+	 * as an OpenBMC unified SEL record type.
+	 * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec)
+	 * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6
+	 * The 16 bytes that follow form the SEL Record
+	 * defined in IPMI spec chapter 32.1 "SEL Event Records"
+	 * Byte 1~2 are Record ID = 0x00 0x00, unused
+	 * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL
+	 * Byte 4~16 are OEM defined
+	 * Byte 11:
+	 * Byte11[7:3] Device#
+	 * Byte11[2:0] Function#
+	 * Byte 12: Bus number
+	 * Byte 13-15: Reserved
+	 * Byte 16: ID of the error detected on the PCIe device that triggered this SEL record
+	 */
+
+	/* Potentially all error status bits could be set for a given PCIe device.
+	 * Therefore, iterate over all 32 bits each of cor and uncor errors
+	 */
+	for (i = 0; i < 32; i++) {
+		/* 1ULL: shifting a plain int 1 by 31 overflows a signed int */
+		if (!(status & (1ULL << i)) || !idarray[i])
+			continue;
+
+		/* dev_fn and bus are masked to one byte each, as the record expects */
+		len = snprintf(openbmc_ipmi_add_sel, sizeof(openbmc_ipmi_add_sel),
+			       "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff %s",
+			       dev_fn & 0xff, bus & 0xff, idarray[i]);
+		if (len < 0 || (size_t)len >= sizeof(openbmc_ipmi_add_sel))
+			return -1;
+		if (system(openbmc_ipmi_add_sel) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Log a PCIe AER event to the BMC SEL. dev_name is parsed as hex
+ * "xxxx:bus:dev.fn"; the first field is ignored.
+ * Returns 0 on success, -1 if dev_name cannot be parsed or logging fails.
+ */
+int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status)
+{
+	unsigned int bus, dev, fn, dev_fn;
+	const char **idarray;
+
+	/* Without all three fields bus/dev/fn would be used uninitialized */
+	if (!dev_name || sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn) != 3)
+		return -1;
+	dev_fn = (((dev & 0x1f) << 3) | (fn & 0x7));
+
+	/* Use the appropriate error status ID table
+	 * for a given severity level
+	 */
+	if (severity == HW_EVENT_AER_CORRECTED)
+		idarray = cor_error_ids;
+	else
+		idarray = uncor_error_ids;
+
+	return verify_id_log_sel(status, idarray, bus, dev_fn);
+}
diff --git a/unified-sel.h b/unified-sel.h
new file mode 100644
index 0000000..17458a5
--- /dev/null
+++ b/unified-sel.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2023, Meta Platforms Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+
+#ifndef _UNIFIED_SEL_H
+#define _UNIFIED_SEL_H
+
+#include <stdint.h>
+
+/*
+ * Log a PCIe AER event to the BMC SEL through ipmitool.
+ * Returns 0 on success, -1 on failure.
+ */
+int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status);
+
+#endif