Skip to content

Commit

Permalink
rasdaemon: add mc_event trigger
Browse files Browse the repository at this point in the history
Allow users to run a trigger when a RAS mc_event occurs. The mc_event
trigger is split into a CE trigger and a UE trigger because CEs are
more frequent than UEs, so a CE trigger has a larger performance
impact. Users can choose different triggers for CE and UE to
reduce this effect.

Users can config trigger in /etc/sysconfig/rasdaemon:

    TRIGGER_DIR: The trigger directory
    MC_CE_TRIGGER: The script executed when corrected error occurs.
    MC_UE_TRIGGER: The script executed when uncorrected error occurs.

No script will be executed if MC_CE_TRIGGER/MC_UE_TRIGGER is unset or empty.

Signed-off-by: Ruidong Tian <[email protected]>
  • Loading branch information
Ruidong Tian committed May 31, 2024
1 parent f9cb13b commit ab680ff
Show file tree
Hide file tree
Showing 8 changed files with 219 additions and 5 deletions.
8 changes: 4 additions & 4 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ all-local: $(SYSTEMD_SERVICES)

sbin_PROGRAMS = rasdaemon
rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
bitfield.c
bitfield.c trigger.c
if WITH_SQLITE3
rasdaemon_SOURCES += ras-record.c
endif
Expand Down Expand Up @@ -93,7 +93,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
non-standard-jaguarmicro.h
non-standard-jaguarmicro.h trigger.h

# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
Expand All @@ -120,6 +120,6 @@ upload:
# custom target
install-data-local:
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
if WITH_MEMORY_CE_PFA
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers"
$(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
endif
$(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger"
24 changes: 24 additions & 0 deletions contrib/mc_event_trigger
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
# Sample mc_event trigger. rasdaemon can execute this script in daemon mode
# each time an mc_event occurs; every field reported by the tracepoint is
# passed in through the environment.
#
# environment:
# TIMESTAMP     Timestamp when the error occurred
# COUNT         Number of errors of the same type
# TYPE          Error type: Corrected or Uncorrected
# MESSAGE       Error message
# LABEL         Label of the affected DIMM(s)
# MC_INDEX      DIMM identifier from DMI/SMBIOS if available
# TOP_LAYER     Top layer of the error
# MIDDLE_LAYER  Middle layer of the error
# LOWER_LAYER   Lower layer of the error
# ADDRESS       Error address
# GRAIN         Minimum granularity for an error report, in bytes
# SYNDROME      Syndrome of the error (or 0 if unknown or not applicable)
# DRIVER_DETAIL Other driver-specific detail about the error
#

# Site-specific handling lives in an optional local script, which is sourced
# only when it exists and is executable.
if [ -x ./mc_event_trigger.local ]; then
	. ./mc_event_trigger.local
fi

exit 0
18 changes: 17 additions & 1 deletion misc/rasdaemon.env
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,20 @@ CPU_CE_THRESHOLD="18"
CPU_ISOLATION_CYCLE="24h"

# Prevent excessive isolation from causing an avalanche effect
CPU_ISOLATION_LIMIT="10"
CPU_ISOLATION_LIMIT="10"

# Event Trigger

# Event trigger will be executed when the specified event occurs.
#
# Directory containing the trigger scripts
# For example: TRIGGER_DIR=/etc/ras/triggers
TRIGGER_DIR=

# Execute these triggers when an mc_event occurs; a trigger that is not
# specified will not be executed.
# For example:
# MC_CE_TRIGGER=mc_event_trigger
# MC_UE_TRIGGER=mc_event_trigger
MC_CE_TRIGGER=
MC_UE_TRIGGER=
17 changes: 17 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-cpu-isolation.h"
#include "trigger.h"

/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
Expand All @@ -62,6 +63,10 @@

extern char *choices_disable;

/*
 * Events that support user triggers, each paired with the routine that
 * reads that event's trigger configuration. setup_event_trigger() walks
 * this table by event name.
 */
static const struct event_trigger event_triggers[] = {
	{ .name = "mc_event", .setup = mc_event_trigger_setup },
};

static int get_debugfs_dir(char *tracing_dir, size_t len)
{
FILE *fp;
Expand Down Expand Up @@ -277,6 +282,16 @@ int toggle_ras_mc_event(int enable)
return rc;
}

static void setup_event_trigger(char *event)
{
struct event_trigger trigger;
for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) {
trigger = event_triggers[i];
if (!strcmp(event, trigger.name))
trigger.setup();
}
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0)
/*
* Set kernel filter. libtrace doesn't provide an API for setting filters
Expand Down Expand Up @@ -871,6 +886,8 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
return EINVAL;
}

setup_event_trigger(event);

log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event);

return 0;
Expand Down
81 changes: 81 additions & 0 deletions ras-mc-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,91 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <traceevent/kbuffer.h>
#include <assert.h>
#include "ras-mc-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-report.h"
#include "trigger.h"

#define MAX_ENV 30
static char *mc_ce_trigger;
static char *mc_ue_trigger;

/*
 * Resolve one mc_event trigger from the environment.
 *
 * @var:  name of the environment variable holding the trigger script name
 * @kind: short tag ("ce" or "ue") used in the log messages
 *
 * Returns the trigger name when it is configured and passes
 * trigger_check(), otherwise NULL. An unset or empty variable means the
 * trigger is simply not configured, so nothing is logged in that case.
 */
static char *mc_trigger_lookup(const char *var, const char *kind)
{
	char *trigger = getenv(var);

	if (!trigger || !strcmp(trigger, ""))
		return NULL;

	if (trigger_check(trigger) < 0) {
		log(SYSLOG, LOG_ERR, "Cannot access mc_event %s trigger `%s`\n",
		    kind, trigger);
		/* Leave it disabled so events do not try to exec it later. */
		return NULL;
	}

	log(SYSLOG, LOG_INFO, "Setup mc_event %s trigger `%s`\n",
	    kind, trigger);
	return trigger;
}

/*
 * Read MC_CE_TRIGGER and MC_UE_TRIGGER from the environment and remember
 * them as the corrected/uncorrected mc_event triggers. A trigger that is
 * not configured or not accessible is stored as NULL, which
 * run_mc_trigger() treats as "do nothing".
 */
void mc_event_trigger_setup(void)
{
	mc_ce_trigger = mc_trigger_lookup("MC_CE_TRIGGER", "ce");
	mc_ue_trigger = mc_trigger_lookup("MC_UE_TRIGGER", "ue");
}

/*
 * Run @mc_trigger for the memory-controller event @ev.
 *
 * Every field of the event is exported to the script as a NAME=value
 * environment string (plus PATH, inherited or defaulted). Nothing is run
 * when @mc_trigger is NULL or empty, or when building the environment
 * fails.
 *
 * The slot counter is advanced only after asprintf() succeeds: on failure
 * the target pointer is undefined, so the slot must not be counted and
 * must never reach free().
 */
static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
{
	char *env[MAX_ENV] = { NULL };
	int ei = 0;
	int i;

	if (!mc_trigger || !strcmp(mc_trigger, ""))
		return;

	if (asprintf(&env[ei], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TIMESTAMP=%s", ev->timestamp) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "COUNT=%d", ev->error_count) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TYPE=%s", ev->error_type) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MESSAGE=%s", ev->msg) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "LABEL=%s", ev->label) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MC_INDEX=%d", ev->mc_index) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TOP_LAYER=%d", ev->top_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "LOWER_LAYER=%d", ev->lower_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "ADDRESS=%llx", ev->address) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "GRAIN=%lld", ev->grain) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "SYNDROME=%llx", ev->syndrome) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
		goto out;
	ei++;

	/* Check that the terminator fits before writing it. */
	assert(ei < MAX_ENV);
	env[ei] = NULL;

	run_trigger(mc_trigger, NULL, env, "mc_event");

out:
	for (i = 0; i < ei; i++)
		free(env[i]);
}

int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
Expand Down Expand Up @@ -194,6 +269,12 @@ int ras_mc_event_handler(struct trace_seq *s,
ras_report_mc_event(ras, &ev);
#endif

if (!strcmp(ev.error_type, "Corrected"))
run_mc_trigger(&ev, mc_ce_trigger);

if (!strcmp(ev.error_type, "Uncorrected"))
run_mc_trigger(&ev, mc_ue_trigger);

return 0;

parse_error:
Expand Down
2 changes: 2 additions & 0 deletions ras-mc-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "ras-events.h"
#include <traceevent/event-parse.h>

/*
 * Read the MC_CE_TRIGGER and MC_UE_TRIGGER environment variables and
 * record them as the triggers for corrected and uncorrected mc_events.
 */
void mc_event_trigger_setup(void);

/*
 * Handler for mc_event trace records. After processing the event it runs
 * the CE trigger for "Corrected" errors and the UE trigger for
 * "Uncorrected" errors. Returns 0 on the success path.
 * NOTE(review): the parse-error return value is not visible here — confirm
 * against the definition.
 */
int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
Expand Down
61 changes: 61 additions & 0 deletions trigger.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include "ras-logger.h"
#include "trigger.h"

/*
 * Execute a trigger script and wait for it to finish.
 *
 * @trigger:  script name. It is looked up under $TRIGGER_DIR when that
 *            variable is set; otherwise it is used as a path as-is, which
 *            is the same rule trigger_check() applies.
 * @argv:     argument vector for the script, or NULL to pass only the
 *            script path as argv[0].
 * @env:      NULL-terminated environment handed to the script.
 * @reporter: name of the event source; used for logging only.
 *
 * The call blocks until the child terminates. A non-zero exit status or a
 * terminating signal is logged; a clean exit is silent.
 */
void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter)
{
	char *trigger_dir = getenv("TRIGGER_DIR");
	char *default_argv[2];
	const char *exe = trigger;
	char *path = NULL;
	pid_t child;
	pid_t waited;
	int status = 0;

	log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);

	if (trigger_dir) {
		if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0)
			return;
		exe = path;
	}

	/* A NULL argv is non-portable for execve(); supply a minimal one. */
	if (!argv) {
		default_argv[0] = (char *)exe;
		default_argv[1] = NULL;
		argv = default_argv;
	}

	child = fork();
	if (child < 0) {
		log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
		goto out;
	}

	if (child == 0) {
		execve(exe, argv, env);
		/* Only reached when execve() failed. */
		_exit(127);
	}

	/* Retry when a signal interrupts the wait so the child is always reaped. */
	do {
		waited = waitpid(child, &status, 0);
	} while (waited < 0 && errno == EINTR);

	if (waited < 0) {
		log(SYSLOG, LOG_ERR, "Cannot wait for trigger %s", trigger);
		goto out;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status)) {
		log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
		    trigger, WEXITSTATUS(status));
	} else if (WIFSIGNALED(status)) {
		log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
		    trigger, WTERMSIG(status));
	}

out:
	free(path);
}

/*
 * Check that trigger @s can be read and executed.
 *
 * When $TRIGGER_DIR is set the path tested is "$TRIGGER_DIR/<s>";
 * otherwise @s is tested as given. Returns the access() result (0 when
 * the file is readable and executable, -1 otherwise), or -1 when the
 * path cannot be allocated.
 */
int trigger_check(char *s)
{
	const char *dir = getenv("TRIGGER_DIR");
	char *full_path = NULL;
	int ret;

	/* No directory configured: test the name exactly as supplied. */
	if (!dir)
		return access(s, R_OK | X_OK);

	if (asprintf(&full_path, "%s/%s", dir, s) < 0)
		return -1;

	ret = access(full_path, R_OK | X_OK);
	free(full_path);

	return ret;
}
13 changes: 13 additions & 0 deletions trigger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef __TRIGGER_H__
#define __TRIGGER_H__

/*
 * Associates a trace event with the routine that sets up its trigger.
 * Field order matters: existing tables initialize this struct positionally.
 */
struct event_trigger {
const char *name;	/* event name, compared with strcmp() */
void (*setup)(void);	/* called to set up the trigger for this event */
};

/*
 * Test whether trigger s is readable and executable, looking it up under
 * $TRIGGER_DIR when that variable is set. Returns 0 on success, -1 otherwise.
 */
int trigger_check(char *s);
/*
 * Fork, execve() the trigger with the given argv and environment, and wait
 * for it to exit. reporter is used only in the log message.
 */
void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter);


#endif

0 comments on commit ab680ff

Please sign in to comment.