Skip to content

Commit

Permalink
ipmitool SEL logging of AER CEs on OpenBMC platforms
Browse files Browse the repository at this point in the history
Signed-off-by: Krishna Dhulipala <[email protected]>
Signed-off-by: Mauro Carvalho Chehab <[email protected]>
  • Loading branch information
krishnanadh authored and mchehab committed Nov 18, 2024
1 parent d22ed5e commit abb10f6
Show file tree
Hide file tree
Showing 9 changed files with 160 additions and 5 deletions.
6 changes: 4 additions & 2 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ endif
if WITH_CPU_FAULT_ISOLATION
rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
endif

# Compile the OpenBMC unified SEL logger only when the matching configure
# conditional is enabled.
if WITH_OPENBMC_UNIFIED_SEL
rasdaemon_SOURCES += unified-sel.c
endif
if WITH_CXL
rasdaemon_SOURCES += ras-cxl-handler.c
endif
Expand All @@ -98,7 +100,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
non-standard-jaguarmicro.h trigger.h
non-standard-jaguarmicro.h trigger.h unified-sel.h

# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
Expand Down
11 changes: 11 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" = "xyes"],
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])

# Optional OpenBMC unified SEL support. Passing --enable-openbmc-unified-sel
# or --enable-all defines HAVE_OPENBMC_UNIFIED_SEL for the C sources and turns
# on the WITH_OPENBMC_UNIFIED_SEL Automake conditional. USE_OPENBMC_UNIFIED_SEL
# is set to "yes" or "no" for the compile time options summary.
AC_ARG_ENABLE([openbmc_unified_sel],
AS_HELP_STRING([--enable-openbmc-unified-sel], [enable OPENBMC_UNIFIED_SEL events (currently experimental)]))

AS_IF([test "x$enable_openbmc_unified_sel" = "xyes" || test "x$enable_all" = "xyes"], [
AC_DEFINE(HAVE_OPENBMC_UNIFIED_SEL,1,"have OpenBMC unified SEL")
AC_SUBST([WITH_OPENBMC_UNIFIED_SEL])
])
AM_CONDITIONAL([WITH_OPENBMC_UNIFIED_SEL], [test x$enable_openbmc_unified_sel = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_OPENBMC_UNIFIED_SEL], [USE_OPENBMC_UNIFIED_SEL="yes"], [USE_OPENBMC_UNIFIED_SEL="no"])

AC_ARG_ENABLE([jaguar_ns_decode],
AS_HELP_STRING([--enable-jaguar-ns-decode], [enable JAGUAR_NS_DECODE events (currently experimental)]))

Expand Down Expand Up @@ -276,6 +286,7 @@ compile time options summary
Memory CE PFA : $USE_MEMORY_CE_PFA
Memory ROW CE PFA : $USE_MEMORY_ROW_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
OpenBMC unified : $USE_OPENBMC_UNIFIED_SEL
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
YITIAN RAS errors : $USE_YITIAN_NS_DECODE
JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE
Expand Down
24 changes: 24 additions & 0 deletions ras-aer-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "ras-aer-handler.h"
#include "ras-logger.h"
#include "ras-report.h"
#include "unified-sel.h"
#include "types.h"

/* bit field meaning for correctable error */
Expand All @@ -26,12 +27,14 @@ static const char *aer_cor_errors[32] = {
[12] = "Replay Timer Timeout",
[13] = "Advisory Non-Fatal",
[14] = "Corrected Internal Error",
[15] = "Header Log Overflow",
};

/* bit field meaning for uncorrectable error */
static const char *aer_uncor_errors[32] = {
/* Uncorrectable errors */
[4] = "Data Link Protocol",
[5] = "Surprise Link Down",
[12] = "Poisoned TLP",
[13] = "Flow Control Protocol",
[14] = "Completion Timeout",
Expand All @@ -41,8 +44,23 @@ static const char *aer_uncor_errors[32] = {
[18] = "Malformed TLP",
[19] = "ECRC",
[20] = "Unsupported Request",
[21] = "ACS Violation",
[22] = "Uncorrected Internal",
[23] = "MC Blocked TLP",
[24] = "AtomicOp Egress Blocked",
[25] = "TLP Prefix Blocked",
[26] = "Poisoned TLP Egrees Blocked",
};

/* Written by ras_aer_handler_init(); gates the OpenBMC unified SEL logging. */
static bool use_ipmitool = false;

/*
 * Remember whether AER events should also be reported through ipmitool.
 *
 * @enable_ipmitool: any value greater than zero turns the logging on,
 *                   everything else turns it off.
 *
 * Without HAVE_OPENBMC_UNIFIED_SEL the request is ignored and the flag
 * keeps its initial value.
 */
void ras_aer_handler_init(int enable_ipmitool)
{
#ifdef HAVE_OPENBMC_UNIFIED_SEL
	if (enable_ipmitool > 0)
		use_ipmitool = true;
	else
		use_ipmitool = false;
#endif
}

#define BUF_LEN 1024

int ras_aer_event_handler(struct trace_seq *s,
Expand Down Expand Up @@ -185,5 +203,11 @@ int ras_aer_event_handler(struct trace_seq *s,
log(SYSLOG, LOG_WARNING, "Failed to execute ipmitool\n");
#endif

#ifdef HAVE_OPENBMC_UNIFIED_SEL
if (use_ipmitool)
if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0)
return -1;
#endif

return 0;
}
1 change: 1 addition & 0 deletions ras-aer-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ int ras_aer_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);

void ras_aer_handler_init(int enable_ipmitool);
#endif
3 changes: 2 additions & 1 deletion ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -925,7 +925,7 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
return 0;
}

int handle_ras_events(int record_events)
int handle_ras_events(int record_events, int enable_ipmitool)
{
int rc, page_size, i;
int num_events = 0;
Expand Down Expand Up @@ -984,6 +984,7 @@ int handle_ras_events(int record_events)
"ras", "mc_event");

#ifdef HAVE_AER
ras_aer_handler_init(enable_ipmitool);
rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event",
ras_aer_event_handler, NULL, AER_EVENT);
if (!rc)
Expand Down
3 changes: 2 additions & 1 deletion ras-events.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ enum ghes_severity {

/* Function prototypes */
int toggle_ras_mc_event(int enable);
int handle_ras_events(int record_events, int enable_ipmitool);
int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);
int handle_ras_events(int record_events, int enable_ipmitool);

#endif
11 changes: 10 additions & 1 deletion rasdaemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const char *argp_program_bug_address = "Mauro Carvalho Chehab <[email protected]
struct arguments {
int record_events;
int enable_ras;
int enable_ipmitool;
int foreground;
int offline;
};
Expand Down Expand Up @@ -61,6 +62,11 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state)
case 'r':
args->record_events++;
break;
#endif
#ifdef HAVE_OPENBMC_UNIFIED_SEL
case 'i':
args->enable_ipmitool++;
break;
#endif
case 'f':
args->foreground++;
Expand Down Expand Up @@ -152,6 +158,9 @@ int main(int argc, char *argv[])
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
#ifdef HAVE_OPENBMC_UNIFIED_SEL
{"ipmitool", 'i', 0, 0, "enable ipmitool logging", 0},
#endif
#ifdef HAVE_MCE
{"post-processing", 'p', 0, 0,
"Post-processing MCE's with raw register values"},
Expand Down Expand Up @@ -200,7 +209,7 @@ int main(int argc, char *argv[])
if (daemon(0, 0))
exit(EXIT_FAILURE);

handle_ras_events(args.record_events);
handle_ras_events(args.record_events, args.enable_ipmitool);

return 0;
}
89 changes: 89 additions & 0 deletions unified-sel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright (c) 2023, Meta Platforms Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-report.h"
#include "unified-sel.h"

/*
 * CPU Root Port Error ID corresponding to each status bit set.
 *
 * The table is indexed by bit position in the correctable error status word.
 * Each entry is the hex byte string that verify_id_log_sel() appends as the
 * last byte of the ipmitool command. Bit positions without an entry stay
 * NULL and are skipped by verify_id_log_sel().
 *
 * NOTE(review): the IDs do not follow bit order ([8] -> 0x04, [12] -> 0x03);
 * presumably this mirrors the OpenBMC unified SEL ID assignment - confirm.
 */
static const char *cor_error_ids[32] = {
/* Correctable errors */
[0] = "0x00", /* Receiver Error */
[6] = "0x01", /* Bad TLP */
[7] = "0x02", /* Bad DLLP */
[8] = "0x04", /* REPLAY_NUM Rollover */
[12] = "0x03", /* Replay Timer Timeout */
[13] = "0x05", /* Advisory Non-Fatal */
[14] = "0x06", /* Corrected Internal */
[15] = "0x07", /* Header Log Overflow */
};

/*
 * Add one OpenBMC unified SEL record for every reported error bit that has
 * an ID assigned.
 *
 * @status:  AER error status word; only bits 0..31 are examined.
 * @idarray: 32-entry table mapping a status bit to the error ID byte
 *           (as a hex string); NULL entries are ignored.
 * @bus:     PCI bus number of the error source.
 * @dev_fn:  PCI device/function byte (device in bits 7:3, function in 2:0).
 *
 * Returns 0 when every record was logged (or nothing had to be logged),
 * -1 when a command could not be formatted or ipmitool returned non-zero.
 */
static int verify_id_log_sel(uint64_t status,
			     const char **idarray,
			     unsigned bus,
			     unsigned dev_fn)
{
	char openbmc_ipmi_add_sel[105];
	int i;

	/*
	 * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL
	 * as an OpenBMC unified SEL record type.
	 * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec)
	 * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6
	 * The 16 byte that follow form the SEL Record
	 * defined in IPMI spec chapter 32.1 "SEL Event Records"
	 * Byte 1~2 are Record ID = 0x00 0x00, unused
	 * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL
	 * Byte 4~16 are OEM defined
	 * Byte 11:
	 * Byte11[7:3] Device#
	 * Byte11[2:0] Function#
	 * Byte 12: Bus number
	 * Byte 13-15: Reserved
	 * Byte 16: ID of the error detected on the PCIe device that triggered this SEL record
	 */

	/* Potentially all error status bits could be set for a given PCIe device.
	 * Therefore, iterate over all 32 bits each of cor and uncor errors
	 */
	for (i = 0; i < 32; i++) {
		int len;

		/*
		 * Shift the 64-bit status instead of building a mask with
		 * "1 << i": that expression is undefined for i == 31 on a
		 * 32-bit int and would sign-extend into the upper half.
		 */
		if (!((status >> i) & 1) || !idarray[i])
			continue;

		len = snprintf(openbmc_ipmi_add_sel, sizeof(openbmc_ipmi_add_sel),
			       "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff %s",
			       dev_fn, bus, idarray[i]);
		/* Never hand a truncated command line to the shell */
		if (len < 0 || (size_t)len >= sizeof(openbmc_ipmi_add_sel))
			return -1;

		if (system(openbmc_ipmi_add_sel) != 0)
			return -1;
	}
	return 0;
}

/*
 * Log the correctable AER errors of a PCIe device to the BMC SEL.
 *
 * @severity: AER severity; only HW_EVENT_AER_CORRECTED produces SEL records.
 * @dev_name: PCI address string in "<domain>:<bus>:<dev>.<fn>" form, with
 *            every field in hexadecimal.
 * @status:   AER error status word of the event.
 *
 * Returns 0 on success or when the severity is not handled, -1 when
 * dev_name cannot be parsed or a SEL record could not be added.
 */
int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status)
{
	unsigned int bus, dev, fn, dev_fn;

	/* Use the appropriate correctable error status ID
	 * for a given severity level
	 */
	if (severity != HW_EVENT_AER_CORRECTED)
		return 0;

	/*
	 * All three fields must be converted; otherwise bus/dev/fn would be
	 * read uninitialised and a bogus record would be written.
	 */
	if (!dev_name ||
	    sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn) != 3)
		return -1;

	/* Pack device (5 bits) and function (3 bits) into one byte */
	dev_fn = ((dev & 0x1f) << 3) | (fn & 0x7);

	if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0)
		return -1;

	return 0;
}
17 changes: 17 additions & 0 deletions unified-sel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2023, Meta Platforms Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*/


#ifndef _UNIFIED_SEL_H
#define _UNIFIED_SEL_H

/* uint64_t is part of the prototype below; keep this header self-contained */
#include <stdint.h>

/*
 * Log the AER errors flagged in @status for PCI device @dev_name to the BMC
 * SEL as OpenBMC unified SEL records.
 *
 * Returns 0 on success, -1 on failure.
 */
int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status);

#endif

0 comments on commit abb10f6

Please sign in to comment.