diff --git a/Makefile.am b/Makefile.am index 9dd42c9..8b35cc5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ all-local: $(SYSTEMD_SERVICES) sbin_PROGRAMS = rasdaemon rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ - bitfield.c + bitfield.c trigger.c if WITH_SQLITE3 rasdaemon_SOURCES += ras-record.c endif @@ -93,7 +93,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ - non-standard-jaguarmicro.h + non-standard-jaguarmicro.h trigger.h # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that @@ -120,6 +120,4 @@ upload: # custom target install-data-local: $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" -if WITH_MEMORY_CE_PFA $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" -endif diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger new file mode 100755 index 0000000..5c6ccfa --- /dev/null +++ b/contrib/mc_event_trigger @@ -0,0 +1,24 @@ +#!/bin/sh +# This shell script can be executed by rasdaemon in daemon mode when an +# mc_event occurs; the environment variables include all information +# reported by the tracepoint. 
+# +# environment: +# TIMESTAMP Timestamp when error occurred +# COUNT Number of errors of the same type +# TYPE Error type from Corrected/Uncorrected +# MESSAGE Error message +# LABEL Label of the affected DIMM(s) +# MC_INDEX DIMM identifier from DMI/SMBIOS if available +# TOP_LAYER Top layer of the error +# MIDDLE_LAYER Middle layer of the error +# LOWER_LAYER Lower layer of the error +# ADDRESS Error address +# GRAIN Minimum granularity for an error report, in bytes +# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable) +# DRIVER_DETAIL Other driver-specific detail about the error +# + +[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local + +exit 0 diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env index 7cb18e8..3389a73 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env @@ -43,4 +43,20 @@ CPU_CE_THRESHOLD="18" CPU_ISOLATION_CYCLE="24h" # Prevent excessive isolation from causing an avalanche effect -CPU_ISOLATION_LIMIT="10" \ No newline at end of file +CPU_ISOLATION_LIMIT="10" + +# Event Trigger + +# An event trigger is executed when the specified event occurs. +# +# Directory containing the trigger executables +# For example: TRIGGER_DIR=/etc/ras/triggers +TRIGGER_DIR= + +# Execute these triggers when an mc_event occurs; a trigger that is +# not specified is not executed. +# For example: +# MC_CE_TRIGGER=mc_event_trigger +# MC_UE_TRIGGER=mc_event_trigger +MC_CE_TRIGGER= +MC_UE_TRIGGER= diff --git a/ras-events.c b/ras-events.c index a097238..4c8dd8c 100644 --- a/ras-events.c +++ b/ras-events.c @@ -45,6 +45,7 @@ #include "ras-logger.h" #include "ras-page-isolation.h" #include "ras-cpu-isolation.h" +#include "trigger.h" /* * Polling time, if read() doesn't block. 
Currently, trace_pipe_raw never
@@ -62,6 +63,11 @@
 
 extern char *choices_disable;
 
+/* Trigger setup hooks, looked up by tracepoint event name */
+static const struct event_trigger event_triggers[] = {
+	{ "mc_event", &mc_event_trigger_setup },
+};
+
 static int get_debugfs_dir(char *tracing_dir, size_t len)
 {
 	FILE *fp;
@@ -277,6 +283,17 @@ int toggle_ras_mc_event(int enable)
 	return rc;
 }
 
+/* Call every trigger setup hook registered under the name @event. */
+static void setup_event_trigger(char *event)
+{
+	for (size_t i = 0; i < ARRAY_SIZE(event_triggers); i++) {
+		const struct event_trigger *trigger = &event_triggers[i];
+
+		if (!strcmp(event, trigger->name))
+			trigger->setup();
+	}
+}
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0)
 /*
  * Set kernel filter. libtrace doesn't provide an API for setting filters
@@ -871,6 +888,8 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
 		return EINVAL;
 	}
 
+	setup_event_trigger(event);
+
 	log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event);
 
 	return 0;
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
index d93ba57..2f06a01 100644
--- a/ras-mc-handler.c
+++ b/ras-mc-handler.c
@@ -15,16 +15,124 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
+#define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <traceevent/kbuffer.h>
+#include <stdarg.h>
 #include "ras-mc-handler.h"
 #include "ras-record.h"
 #include "ras-logger.h"
 #include "ras-page-isolation.h"
 #include "ras-report.h"
+#include "trigger.h"
+
+/* Capacity of the trigger environment array, including its NULL terminator */
+#define MAX_ENV 30
+
+/* Validated trigger names; NULL when the trigger is unset or unusable */
+static char *mc_ce_trigger;
+static char *mc_ue_trigger;
+
+/*
+ * Read the trigger name from environment variable @var and validate it with
+ * trigger_check(). @kind ("ce" or "ue") is only used in the log messages.
+ *
+ * Returns the trigger name, or NULL when the variable is unset or empty or
+ * when trigger_check() fails, so that an unusable trigger is never run.
+ */
+static char *mc_trigger_lookup(const char *var, const char *kind)
+{
+	char *trigger = getenv(var);
+
+	/* An unconfigured trigger is not an error: stay silent. */
+	if (!trigger || !strcmp(trigger, ""))
+		return NULL;
+
+	if (trigger_check(trigger) < 0) {
+		log(SYSLOG, LOG_ERR, "Cannot access mc_event %s trigger `%s`\n",
+		    kind, trigger);
+		return NULL;
+	}
+
+	log(SYSLOG, LOG_INFO, "Setup mc_event %s trigger `%s`\n", kind, trigger);
+	return trigger;
+}
+
+/* Load the corrected/uncorrected mc_event triggers from the environment. */
+void mc_event_trigger_setup(void)
+{
+	mc_ce_trigger = mc_trigger_lookup("MC_CE_TRIGGER", "ce");
+	mc_ue_trigger = mc_trigger_lookup("MC_UE_TRIGGER", "ue");
+}
+
+/*
+ * Append one printf-formatted "NAME=value" string to @env and bump *@count.
+ *
+ * The slot is written only after the string has been built: the pointer
+ * is left undefined when asprintf()/vasprintf() fail, so it must never be
+ * stored where the cleanup loop would free it. One slot is kept in reserve
+ * for the NULL terminator. Returns 0 on success, -1 on failure.
+ */
+static int mc_env_add(char **env, int *count, const char *fmt, ...)
+{
+	va_list ap;
+	char *entry;
+	int rc;
+
+	if (*count >= MAX_ENV - 1)
+		return -1;
+
+	va_start(ap, fmt);
+	rc = vasprintf(&entry, fmt, ap);
+	va_end(ap);
+	if (rc < 0)
+		return -1;
+
+	env[(*count)++] = entry;
+	return 0;
+}
+
+/*
+ * Run @mc_trigger with the fields of @ev exported as environment variables.
+ * Does nothing when @mc_trigger is NULL or empty.
+ */
+static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
+{
+	char *env[MAX_ENV];
+	int ei = 0;
+	int i;
+
+	if (!mc_trigger || !strcmp(mc_trigger, ""))
+		return;
+
+	/* Short-circuit evaluation keeps the variables in this order. */
+	if (mc_env_add(env, &ei, "PATH=%s",
+		       getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0 ||
+	    mc_env_add(env, &ei, "TIMESTAMP=%s", ev->timestamp) < 0 ||
+	    mc_env_add(env, &ei, "COUNT=%d", ev->error_count) < 0 ||
+	    mc_env_add(env, &ei, "TYPE=%s", ev->error_type) < 0 ||
+	    mc_env_add(env, &ei, "MESSAGE=%s", ev->msg) < 0 ||
+	    mc_env_add(env, &ei, "LABEL=%s", ev->label) < 0 ||
+	    mc_env_add(env, &ei, "MC_INDEX=%d", ev->mc_index) < 0 ||
+	    mc_env_add(env, &ei, "TOP_LAYER=%d", ev->top_layer) < 0 ||
+	    mc_env_add(env, &ei, "MIDDLE_LAYER=%d", ev->middle_layer) < 0 ||
+	    mc_env_add(env, &ei, "LOWER_LAYER=%d", ev->lower_layer) < 0 ||
+	    mc_env_add(env, &ei, "ADDRESS=%llx", ev->address) < 0 ||
+	    mc_env_add(env, &ei, "GRAIN=%lld", ev->grain) < 0 ||
+	    mc_env_add(env, &ei, "SYNDROME=%llx", ev->syndrome) < 0 ||
+	    mc_env_add(env, &ei, "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
+		goto free;
+	env[ei] = NULL;
+
+	run_trigger(mc_trigger, NULL, env, "mc_event");
+
+free:
+	for (i = 0; i < ei; i++)
+		free(env[i]);
+}
 
 int ras_mc_event_handler(struct trace_seq *s,
 			 struct tep_record *record,
@@ -194,6 +302,12 @@ int ras_mc_event_handler(struct trace_seq *s,
 	ras_report_mc_event(ras, &ev);
 #endif
 
+	if (!strcmp(ev.error_type, "Corrected"))
+		run_mc_trigger(&ev, mc_ce_trigger);
+
+	if (!strcmp(ev.error_type, "Uncorrected"))
+		run_mc_trigger(&ev, mc_ue_trigger);
+
 	return 0;
 
 parse_error:
diff --git a/ras-mc-handler.h b/ras-mc-handler.h
index afc0005..a7637b2 100644
--- a/ras-mc-handler.h
+++ b/ras-mc-handler.h
@@ -22,6 +22,9 @@
 #include "ras-events.h"
 #include <traceevent/event-parse.h>
 
+/* Load the mc_event triggers named by MC_CE_TRIGGER and MC_UE_TRIGGER */
+void mc_event_trigger_setup(void);
+
 int ras_mc_event_handler(struct trace_seq *s,
 			 struct tep_record *record,
 			 struct tep_event *event, void *context);
diff --git a/trigger.c b/trigger.c
new file mode 100644
index 0000000..48c88ea
--- /dev/null
+++ b/trigger.c
@@ -0,0 +1,103 @@
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include "ras-logger.h"
+#include "trigger.h"
+
+/*
+ * Build the path of @trigger: "$TRIGGER_DIR/<trigger>" when TRIGGER_DIR is
+ * set and non-empty, otherwise @trigger itself. Shared by run_trigger() and
+ * trigger_check() so that the file that is checked is the file that is run.
+ *
+ * Returns a malloc'd string the caller must free(), or NULL on failure.
+ */
+static char *trigger_path(const char *trigger)
+{
+	const char *trigger_dir = getenv("TRIGGER_DIR");
+	char *path;
+	int rc;
+
+	if (trigger_dir && *trigger_dir)
+		rc = asprintf(&path, "%s/%s", trigger_dir, trigger);
+	else
+		rc = asprintf(&path, "%s", trigger);
+
+	return rc < 0 ? NULL : path;
+}
+
+/*
+ * Run @trigger in a child process and wait for it to finish.
+ *
+ * @argv:     argument vector for the trigger; when NULL, the trigger gets
+ *            its own path as argv[0] and no further arguments
+ * @env:      environment passed to execve() for the trigger
+ * @reporter: caller name, used in the log message only
+ *
+ * A non-zero exit status or a terminating signal is logged.
+ */
+void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter)
+{
+	pid_t child;
+	char *path;
+	int status;
+
+	log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);
+
+	path = trigger_path(trigger);
+	if (!path)
+		return;
+
+	child = fork();
+	if (child < 0) {
+		log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
+		goto out;
+	}
+
+	if (child == 0) {
+		char *default_argv[] = { path, NULL };
+
+		/* A NULL argv is not portable: always provide at least argv[0]. */
+		execve(path, argv ? argv : default_argv, env);
+		_exit(127);
+	}
+
+	/* status is valid only once waitpid() has actually reaped the child */
+	while (waitpid(child, &status, 0) < 0) {
+		if (errno != EINTR)
+			goto out;
+	}
+
+	if (WIFEXITED(status) && WEXITSTATUS(status)) {
+		log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
+		    trigger, WEXITSTATUS(status));
+	} else if (WIFSIGNALED(status)) {
+		log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
+		    trigger, WTERMSIG(status));
+	}
+
+out:
+	free(path);
+}
+
+/*
+ * Check that trigger @s, resolved the same way run_trigger() resolves it,
+ * can be read and executed.
+ *
+ * Returns 0 if it can, -1 otherwise.
+ */
+int trigger_check(const char *s)
+{
+	char *name = trigger_path(s);
+	int rc;
+
+	if (!name)
+		return -1;
+
+	rc = access(name, R_OK | X_OK);
+	free(name);
+
+	return rc;
+}
diff --git a/trigger.h b/trigger.h
new file mode 100644
index 0000000..556a7f2
--- /dev/null
+++ b/trigger.h
@@ -0,0 +1,16 @@
+#ifndef __TRIGGER_H__
+#define __TRIGGER_H__
+
+/* Binds a tracepoint event name to the function that sets up its triggers */
+struct event_trigger {
+	const char *name;
+	void (*setup)(void);
+};
+
+/* Returns 0 when trigger @s can be read and executed, -1 otherwise */
+int trigger_check(const char *s);
+
+/* Runs @trigger with @argv and @env and waits for it; @reporter is logged */
+void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter);
+
+#endif