Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rasdaemon: ipmitool SEL logging of AER CEs on OpenBMC platforms #97

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ endif
if WITH_CPU_FAULT_ISOLATION
rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
endif

if WITH_OPENBMC_UNIFIED_SEL
rasdaemon_SOURCES += unified-sel.c
endif
if WITH_CXL
rasdaemon_SOURCES += ras-cxl-handler.c
endif
Expand All @@ -98,7 +100,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
non-standard-jaguarmicro.h trigger.h
non-standard-jaguarmicro.h trigger.h unified-sel.h

# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
Expand Down
11 changes: 11 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" = "xyes"],
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])

AC_ARG_ENABLE([openbmc_unified_sel],
AS_HELP_STRING([--enable-openbmc-unified-sel], [enable OPENBMC_UNIFIED_SEL events (currently experimental)]))

AS_IF([test "x$enable_openbmc_unified_sel" = "xyes" || test "x$enable_all" = "xyes"], [
AC_DEFINE(HAVE_OPENBMC_UNIFIED_SEL,1,"have OpenBMC unified SEL")
AC_SUBST([WITH_OPENBMC_UNIFIED_SEL])
])
AM_CONDITIONAL([WITH_OPENBMC_UNIFIED_SEL], [test x$enable_openbmc_unified_sel = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_OPENBMC_UNIFIED_SEL], [USE_OPENBMC_UNIFIED_SEL="yes"], [USE_OPENBMC_UNIFIED_SEL="no"])

AC_ARG_ENABLE([jaguar_ns_decode],
AS_HELP_STRING([--enable-jaguar-ns-decode], [enable JAGUAR_NS_DECODE events (currently experimental)]))

Expand Down Expand Up @@ -263,6 +273,7 @@ compile time options summary
CXL events : $USE_CXL
Memory CE PFA : $USE_MEMORY_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
OpenBMC unified : $USE_OPENBMC_UNIFIED_SEL
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
YITIAN RAS errors : $USE_YITIAN_NS_DECODE
JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE
Expand Down
27 changes: 26 additions & 1 deletion ras-aer-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "ras-aer-handler.h"
#include "ras-logger.h"
#include "ras-report.h"
#include "unified-sel.h"
#include "types.h"

/* bit field meaning for correctable error */
Expand All @@ -25,13 +26,16 @@ static const char *aer_cor_errors[32] = {
[8] = "RELAY_NUM Rollover",
[12] = "Replay Timer Timeout",
[13] = "Advisory Non-Fatal",
[14] = "Corrected Internal Error",
[14] = "Corrected Internal",
[15] = "Header Log Overflow",
[16] = "Corrected Internal Error",
};

/* bit field meaning for uncorrectable error */
static const char *aer_uncor_errors[32] = {
/* Uncorrectable errors */
[4] = "Data Link Protocol",
[5] = "Surprise Link Down",
[12] = "Poisoned TLP",
[13] = "Flow Control Protocol",
[14] = "Completion Timeout",
Expand All @@ -41,8 +45,23 @@ static const char *aer_uncor_errors[32] = {
[18] = "Malformed TLP",
[19] = "ECRC",
[20] = "Unsupported Request",
[21] = "ACS Violation",
[22] = "Uncorrected Internal",
[23] = "MC Blocked TLP",
[24] = "AtomicOp Egress Blocked",
[25] = "TLP Prefix Blocked",
[26] = "Poisoned TLP Egrees Blocked",
};

/* Gates the openbmc_unified_sel_log() call made by the AER event handler. */
static bool use_ipmitool = false;

/*
 * ras_aer_handler_init - remember whether ipmitool SEL logging was requested.
 * @enable_ipmitool: a positive value switches logging on, anything else
 *                   switches it off.
 *
 * When HAVE_OPENBMC_UNIFIED_SEL is not defined the argument is ignored and
 * the flag keeps its current value.
 */
void ras_aer_handler_init(int enable_ipmitool)
{
#ifdef HAVE_OPENBMC_UNIFIED_SEL
	if (enable_ipmitool > 0)
		use_ipmitool = true;
	else
		use_ipmitool = false;
#endif
}

#define BUF_LEN 1024

int ras_aer_event_handler(struct trace_seq *s,
Expand Down Expand Up @@ -185,5 +204,11 @@ int ras_aer_event_handler(struct trace_seq *s,
log(SYSLOG, LOG_WARNING, "Failed to execute ipmitool\n");
#endif

#ifdef HAVE_OPENBMC_UNIFIED_SEL
if (use_ipmitool)
if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0)
return -1;
#endif

return 0;
}
1 change: 1 addition & 0 deletions ras-aer-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ int ras_aer_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);

void ras_aer_handler_init(int enable_ipmitool);
#endif
3 changes: 2 additions & 1 deletion ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,7 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
return 0;
}

int handle_ras_events(int record_events)
int handle_ras_events(int record_events, int enable_ipmitool)
{
int rc, page_size, i;
int num_events = 0;
Expand Down Expand Up @@ -967,6 +967,7 @@ int handle_ras_events(int record_events)
"ras", "mc_event");

#ifdef HAVE_AER
ras_aer_handler_init(enable_ipmitool);
rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event",
ras_aer_event_handler, NULL, AER_EVENT);
if (!rc)
Expand Down
3 changes: 2 additions & 1 deletion ras-events.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ enum ghes_severity {

/* Function prototypes */
int toggle_ras_mc_event(int enable);
int handle_ras_events(int record_events, int enable_ipmitool);
int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);
int handle_ras_events(int record_events, int enable_ipmitool);

#endif
10 changes: 9 additions & 1 deletion rasdaemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const char *argp_program_bug_address = "Mauro Carvalho Chehab <[email protected]
struct arguments {
int record_events;
int enable_ras;
int enable_ipmitool;
int foreground;
int offline;
};
Expand Down Expand Up @@ -61,6 +62,11 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state)
case 'r':
args->record_events++;
break;
#endif
#ifdef HAVE_OPENBMC_UNIFIED_SEL
case 'i':
args->enable_ipmitool++;
break;
#endif
case 'f':
args->foreground++;
Expand Down Expand Up @@ -152,6 +158,8 @@ int main(int argc, char *argv[])
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
#ifdef HAVE_OPENBMC_UNIFIED_SEL
{"ipmitool", 'i', 0, 0, "enable ipmitool logging", 0},
#ifdef HAVE_MCE
{"post-processing", 'p', 0, 0,
"Post-processing MCE's with raw register values"},
Expand Down Expand Up @@ -200,7 +208,7 @@ int main(int argc, char *argv[])
if (daemon(0, 0))
exit(EXIT_FAILURE);

handle_ras_events(args.record_events);
handle_ras_events(args.record_events, args.enable_ipmitool);

return 0;
}
110 changes: 110 additions & 0 deletions unified-sel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
* Copyright (c) 2023, Meta Platforms Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-report.h"
#include "unified-sel.h"

/*
 * CPU Root Port Error IDs for correctable AER errors.
 *
 * The table is indexed by bit position in the correctable error status
 * word; each entry is the error ID string placed in the last byte of the
 * SEL record. Bits without an entry are left NULL.
 */
static const char *cor_error_ids[32] = {
	[0]  = "0x00",	/* Receiver Error */
	[6]  = "0x01",	/* Bad TLP */
	[7]  = "0x02",	/* Bad DLLP */
	[8]  = "0x04",	/* REPLAY_NUM Rollover */
	[12] = "0x03",	/* Replay Timer Timeout */
	[13] = "0x05",	/* Advisory Non-Fatal */
	[14] = "0x06",	/* Corrected Internal */
	[15] = "0x07",	/* Header Log Overflow */
};

/*
 * CPU Root Port Error IDs for uncorrectable AER errors.
 *
 * Indexed by bit position in the uncorrectable error status word; each
 * entry is the error ID string for the last byte of the SEL record. Bits
 * without an entry are left NULL. Every defined bit must map to a distinct
 * ID so the BMC can tell the errors apart.
 *
 * NOTE(review): nothing in the code visible here passes this table to
 * verify_id_log_sel() yet; only correctable errors are logged.
 */
static const char *uncor_error_ids[32] = {
	[4]  = "0x20",	/* Data Link Protocol */
	[5]  = "0x21",	/* Surprise Link Down */
	[12] = "0x22",	/* Poisoned TLP */
	[13] = "0x23",	/* Flow Control Protocol */
	[14] = "0x24",	/* Completion Timeout */
	[15] = "0x25",	/* Completer Abort */
	[16] = "0x26",	/* Unexpected Completion */
	[17] = "0x27",	/* Receiver Overflow */
	[18] = "0x28",	/* Malformed TLP (was a duplicate of the ECRC ID) */
	[19] = "0x29",	/* ECRC */
	[20] = "0x2A",	/* Unsupported Request */
	[21] = "0x2B",	/* ACS Violation */
	[22] = "0x2C",	/* Uncorrected Internal */
	[23] = "0x2D",	/* MC Blocked TLP */
	[24] = "0x2E",	/* AtomicOp Egress Blocked */
	[25] = "0x2F",	/* TLP Prefix Blocked */
	[26] = "0x30",	/* Poisoned TLP Egress Blocked */
};

/*
 * verify_id_log_sel - add one BMC SEL record per recognised AER status bit.
 * @status:  AER status word; only bits 0-31 are examined
 * @idarray: 32-entry table mapping a status bit to its SEL error ID string;
 *           NULL entries are skipped
 * @bus:     PCI bus number of the error source (SEL byte 12)
 * @dev_fn:  device number in bits 7:3 and function number in bits 2:0
 *           (SEL byte 11)
 *
 * Returns 0 when every required record was added (or none was required),
 * -1 when a command line did not fit the buffer or system() reported a
 * non-zero status. Processing stops at the first failure.
 */
static int verify_id_log_sel(uint64_t status,
			     const char **idarray,
			     unsigned bus,
			     unsigned dev_fn)
{
	char openbmc_ipmi_add_sel[105];
	int i;
	int len;

	/*
	 * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL
	 * as an OpenBMC unified SEL record type.
	 * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec)
	 * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6
	 * The 16 bytes that follow form the SEL Record
	 * defined in IPMI spec chapter 32.1 "SEL Event Records"
	 * Byte 1~2 are Record ID = 0x00 0x00, unused
	 * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL
	 * Byte 4~16 are OEM defined
	 * Byte 11:
	 *   Byte11[7:3] Device#
	 *   Byte11[2:0] Function#
	 * Byte 12: Bus number
	 * Byte 13-15: Reserved
	 * Byte 16: ID of the error detected on the PCIe device that triggered this SEL record
	 */

	/*
	 * Potentially all error status bits could be set for a given PCIe
	 * device, so walk all 32 bits and emit one record per known bit.
	 */
	for (i = 0; i < 32; i++) {
		/* 64-bit mask: a plain int "1 << 31" overflows and would also match bits 32-63. */
		if (!(status & ((uint64_t)1 << i)) || !idarray[i])
			continue;

		len = snprintf(openbmc_ipmi_add_sel, sizeof(openbmc_ipmi_add_sel),
			       "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff %s",
			       dev_fn, bus, idarray[i]);
		/* Never hand a truncated command line to the shell. */
		if (len < 0 || (size_t)len >= sizeof(openbmc_ipmi_add_sel))
			return -1;

		if (system(openbmc_ipmi_add_sel) != 0)
			return -1;
	}
	return 0;
}

/*
 * openbmc_unified_sel_log - record a PCIe AER event in the BMC SEL.
 * @severity: AER severity of the event; only HW_EVENT_AER_CORRECTED events
 *            are logged, every other severity is accepted and ignored
 * @dev_name: PCI address of the error source, parsed as hexadecimal
 *            "domain:bus:device.function"
 * @status:   AER status word for the event
 *
 * Returns 0 on success or when the severity is not logged, -1 when
 * @dev_name is NULL or cannot be parsed, or when adding a SEL record failed.
 */
int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status)
{
	unsigned int bus, dev, fn, dev_fn;

	/* Only correctable errors have SEL records generated for them. */
	if (severity != HW_EVENT_AER_CORRECTED)
		return 0;

	/* All three fields must parse; otherwise the SEL bytes would be garbage. */
	if (!dev_name || sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn) != 3)
		return -1;

	/* SEL byte 11 layout: device number in bits 7:3, function in bits 2:0. */
	dev_fn = ((dev & 0x1f) << 3) | (fn & 0x7);

	if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0)
		return -1;

	return 0;
}
17 changes: 17 additions & 0 deletions unified-sel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2023, Meta Platforms Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/


#ifndef _UNIFIED_SEL_H
#define _UNIFIED_SEL_H

/* uint64_t is used in the prototype below; keep this header self-contained. */
#include <stdint.h>

/*
 * openbmc_unified_sel_log - record a PCIe AER event in the BMC SEL.
 * @severity: AER severity of the event
 * @dev_name: PCI address of the error source ("domain:bus:device.function")
 * @status:   AER status word for the event
 *
 * Returns 0 on success, -1 on failure.
 */
int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status);

#endif