Skip to content

Commit

Permalink
rasdaemon: add mc_event trigger
Browse files Browse the repository at this point in the history
Allow users to run a trigger when a RAS mc_event occurs. The mc_event
trigger is split into a CE trigger and a UE trigger because CEs are far
more frequent than UEs, so a CE trigger has a larger performance impact.
Users can configure different triggers for CE and UE to limit this
effect.

Users can config trigger in /etc/sysconfig/rasdaemon:

    TRIGGER_DIR: The trigger directory
    MC_CE_TRIGGER: The script executed when corrected error occurs.
    MC_UE_TRIGGER: The script executed when uncorrected error occurs.

No script will be executed if MC_CE_TRIGGER/MC_UE_TRIGGER is null.

Signed-off-by: Ruidong Tian <[email protected]>
  • Loading branch information
Ruidong Tian committed Feb 26, 2024
1 parent f9cb13b commit a4c9676
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 5 deletions.
6 changes: 2 additions & 4 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ all-local: $(SYSTEMD_SERVICES)

sbin_PROGRAMS = rasdaemon
rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
bitfield.c
bitfield.c trigger.c
if WITH_SQLITE3
rasdaemon_SOURCES += ras-record.c
endif
Expand Down Expand Up @@ -93,7 +93,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
non-standard-jaguarmicro.h
non-standard-jaguarmicro.h trigger.h

# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
Expand All @@ -120,6 +120,4 @@ upload:
# custom target
install-data-local:
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
if WITH_MEMORY_CE_PFA
$(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
endif
24 changes: 24 additions & 0 deletions contrib/mc_event_trigger
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
# Trigger script that rasdaemon can execute when an mc_event occurs.
# All information about the event is passed in through environment
# variables; the script receives no other input.
#
# environment:
# PATH           Search path inherited from rasdaemon (or a built-in default)
# TIMESTAMP      Timestamp when error occurred
# COUNT          Number of errors of the same type
# TYPE           Error type from Corrected/Uncorrected
# MESSAGE        Error message
# LABEL          Label of the affected DIMM(s)
# MC_INDEX       DIMM identifier from DMI/SMBIOS if available
# TOP_LAYER      Top layer of the error
# MIDDLE_LAYER   Middle layer of the error
# LOWER_LAYER    Low layer of the error
# ADDRESS        Error address (hexadecimal, without a 0x prefix)
# GRAIN          Minimum granularity for an error report, in bytes
# SYNDROME       Syndrome of the error, in hexadecimal (or 0 if unknown or
#                if the syndrome is not applicable)
# DRIVER_DETAIL  Other driver-specific detail about the error
#

# Source an optional site-specific hook. The path is relative, so it is
# looked up in the current working directory of the process running this
# script.
[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local

# Always report success, whatever the hook above did or returned.
exit 0
18 changes: 17 additions & 1 deletion misc/rasdaemon.env
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,20 @@ CPU_CE_THRESHOLD="18"
CPU_ISOLATION_CYCLE="24h"

# Prevent excessive isolation from causing an avalanche effect
CPU_ISOLATION_LIMIT="10"
CPU_ISOLATION_LIMIT="10"

# Event Trigger

# Event trigger will be executed when the specified event occurs.
#
# Directory that contains the trigger scripts
# For example: TRIGGER_DIR=/etc/ras/triggers
TRIGGER_DIR=

# Execute these triggers when an mc_event occurs. A trigger is not
# executed if it is not specified.
# For example:
# MC_CE_TRIGGER=mc_event_trigger
# MC_UE_TRIGGER=mc_event_trigger
MC_CE_TRIGGER=
MC_UE_TRIGGER=
17 changes: 17 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-cpu-isolation.h"
#include "trigger.h"

/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
Expand All @@ -62,6 +63,10 @@

extern char *choices_disable;

/*
 * Table of per-event trigger setup hooks. Each entry pairs a tracepoint
 * event name with the function that configures the triggers for that
 * event; setup_event_trigger() looks events up here by name.
 *
 * The storage-class specifier comes first: placing it after a qualifier
 * ("const static") is an obsolescent form in C11.
 */
static const struct event_trigger event_triggers[] = {
	{ "mc_event", &mc_event_trigger_setup },
};

static int get_debugfs_dir(char *tracing_dir, size_t len)
{
FILE *fp;
Expand Down Expand Up @@ -277,6 +282,16 @@ int toggle_ras_mc_event(int enable)
return rc;
}

static void setup_event_trigger(char *event)
{
struct event_trigger trigger;
for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) {
trigger = event_triggers[i];
if (!strcmp(event, trigger.name))
trigger.setup();
}
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0)
/*
* Set kernel filter. libtrace doesn't provide an API for setting filters
Expand Down Expand Up @@ -871,6 +886,8 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
return EINVAL;
}

setup_event_trigger(event);

log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event);

return 0;
Expand Down
81 changes: 81 additions & 0 deletions ras-mc-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,91 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <traceevent/kbuffer.h>
#include <assert.h>
#include "ras-mc-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-report.h"
#include "trigger.h"

#define MAX_ENV 30
static char *mc_ce_trigger;
static char *mc_ue_trigger;

/*
 * Look up one mc_event trigger in the environment and validate it.
 *
 * @env_name: environment variable holding the trigger name
 * @kind:     short tag used in log messages ("ce" or "ue")
 *
 * Returns the trigger name (the string owned by the environment) when it
 * is configured and trigger_check() accepts it, otherwise NULL. An unset
 * or empty variable is a valid "no trigger" configuration and is not
 * reported as an error.
 */
static char *mc_trigger_lookup(const char *env_name, const char *kind)
{
	char *trigger = getenv(env_name);

	if (!trigger || !strcmp(trigger, ""))
		return NULL;

	if (trigger_check(trigger) < 0) {
		log(SYSLOG, LOG_ERR, "Cannot access mc_event %s trigger `%s`\n",
		    kind, trigger);
		/* Drop it so that the handler does not try to run it per event. */
		return NULL;
	}

	log(SYSLOG, LOG_INFO, "Setup mc_event %s trigger `%s`\n", kind, trigger);
	return trigger;
}

/*
 * Configure the corrected-error and uncorrected-error triggers for
 * mc_event from MC_CE_TRIGGER and MC_UE_TRIGGER.
 *
 * After this call mc_ce_trigger / mc_ue_trigger are either NULL (not
 * configured or not accessible) or point at a trigger that passed
 * trigger_check().
 */
void mc_event_trigger_setup(void)
{
	mc_ce_trigger = mc_trigger_lookup("MC_CE_TRIGGER", "ce");
	mc_ue_trigger = mc_trigger_lookup("MC_UE_TRIGGER", "ue");
}

/*
 * Export one mc_event as environment variables and run a trigger with it.
 *
 * @ev:         event whose fields are exported to the trigger
 * @mc_trigger: trigger name handed to run_trigger(); NULL or "" means
 *              that no trigger is configured and nothing is done
 *
 * The environment consists of PATH (inherited, or a built-in default)
 * followed by one NAME=value string per event field. All strings are
 * allocated here and released before returning, whether or not the
 * trigger was run.
 */
static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
{
	char *env[MAX_ENV];
	int ei = 0;
	int i;

	if (!mc_trigger || !strcmp(mc_trigger, ""))
		return;

	/*
	 * The index is advanced only after asprintf() succeeded: on failure
	 * the contents of the target pointer are undefined, so that slot must
	 * not be handed to free() by the cleanup loop below.
	 */
	if (asprintf(&env[ei], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TIMESTAMP=%s", ev->timestamp) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "COUNT=%d", ev->error_count) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TYPE=%s", ev->error_type) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MESSAGE=%s", ev->msg) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "LABEL=%s", ev->label) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MC_INDEX=%d", ev->mc_index) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TOP_LAYER=%d", ev->top_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "LOWER_LAYER=%d", ev->lower_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "ADDRESS=%llx", ev->address) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "GRAIN=%lld", ev->grain) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "SYNDROME=%llx", ev->syndrome) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
		goto out;
	ei++;

	/* Check the bound before writing the terminator, not after. */
	assert(ei < MAX_ENV);
	env[ei] = NULL;

	run_trigger(mc_trigger, NULL, env, "mc_event");

out:
	for (i = 0; i < ei; i++)
		free(env[i]);
}

int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
Expand Down Expand Up @@ -194,6 +269,12 @@ int ras_mc_event_handler(struct trace_seq *s,
ras_report_mc_event(ras, &ev);
#endif

if (!strcmp(ev.error_type, "Corrected"))
run_mc_trigger(&ev, mc_ce_trigger);

if (!strcmp(ev.error_type, "Uncorrected"))
run_mc_trigger(&ev, mc_ue_trigger);

return 0;

parse_error:
Expand Down
2 changes: 2 additions & 0 deletions ras-mc-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "ras-events.h"
#include <traceevent/event-parse.h>

void mc_event_trigger_setup(void);

int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
Expand Down
61 changes: 61 additions & 0 deletions trigger.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/wait.h>
#include "ras-logger.h"
#include "trigger.h"

/*
 * Execute a trigger program in a child process and wait for it to finish.
 *
 * @trigger:  trigger file name; resolved against $TRIGGER_DIR when that
 *            variable is set, used as given otherwise (the same rule
 *            trigger_check() applies)
 * @argv:     argument vector for the trigger, or NULL to pass only the
 *            resolved path as argv[0]
 * @env:      NULL-terminated environment for the trigger
 * @reporter: name of the event source, used for logging only
 *
 * The call blocks until the child exits. A non-zero exit status or death
 * by signal is logged; nothing is returned to the caller.
 */
void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter)
{
	pid_t child;
	char *path;
	int status;
	char *default_argv[2];
	const char *trigger_dir = getenv("TRIGGER_DIR");

	log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);

	/* Never hand a NULL directory to "%s"; fall back to the bare name. */
	if (trigger_dir) {
		if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0)
			return;
	} else {
		if (asprintf(&path, "%s", trigger) < 0)
			return;
	}

	/* A NULL argv for execve() is a non-portable Linux-only extension. */
	if (!argv) {
		default_argv[0] = path;
		default_argv[1] = NULL;
		argv = default_argv;
	}

	child = fork();
	if (child < 0) {
		log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
		goto out;
	}

	if (child == 0) {
		execve(path, argv, env);
		/* Only reached when execve() failed. */
		_exit(127);
	}

	/* status is only meaningful when waitpid() itself succeeded. */
	if (waitpid(child, &status, 0) < 0) {
		log(SYSLOG, LOG_ERR, "Cannot wait for trigger %s", trigger);
	} else if (WIFEXITED(status) && WEXITSTATUS(status)) {
		log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
		    trigger, WEXITSTATUS(status));
	} else if (WIFSIGNALED(status)) {
		log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
		    trigger, WTERMSIG(status));
	}

out:
	/* The path was allocated by asprintf() and is owned by this function. */
	free(path);
}

/*
 * Check that a trigger can be read and executed by this process.
 *
 * @s: trigger name; looked up as "$TRIGGER_DIR/<s>" when TRIGGER_DIR is
 *     set in the environment, used as given otherwise
 *
 * Returns the result of access(path, R_OK|X_OK), or -1 when the path
 * could not be allocated.
 */
int trigger_check(char *s)
{
	const char *dir = getenv("TRIGGER_DIR");
	char *joined = NULL;
	int rc;

	/* Without a trigger directory the name is checked as given. */
	if (!dir)
		return access(s, R_OK|X_OK);

	if (asprintf(&joined, "%s/%s", dir, s) < 0)
		return -1;

	rc = access(joined, R_OK|X_OK);
	free(joined);

	return rc;
}
13 changes: 13 additions & 0 deletions trigger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/* Interface for running user-configured trigger programs. */
#ifndef __TRIGGER_H__
#define __TRIGGER_H__

/* Associates an event name with the function that sets up its triggers. */
struct event_trigger {
/* Event name that is compared against the event being enabled. */
const char *name;
/* Setup hook called for a matching event; takes and returns nothing. */
void (*setup)(void);
};

/*
 * Check that trigger s (resolved against $TRIGGER_DIR when that variable
 * is set) is readable and executable. Returns the access() result, or -1
 * when the path could not be allocated.
 */
int trigger_check(char *s);
/*
 * Fork, execve() the trigger with the given argv and env, and wait for
 * the child to finish. reporter is only used in the log message.
 */
void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter);


#endif

0 comments on commit a4c9676

Please sign in to comment.