Skip to content

Commit

Permalink
Update the code to use scontrol in place of slurm APIs to drain the
Browse files Browse the repository at this point in the history
node to make it slurm version independent
  • Loading branch information
mithun-pensando committed Jan 23, 2025
1 parent 16438ef commit 3b048a6
Show file tree
Hide file tree
Showing 6 changed files with 325 additions and 55 deletions.
27 changes: 15 additions & 12 deletions redfish-exporter/.env
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
UPDATED="2024-09-24"
UPDATED="2025-01-22"
DESCRIPTION="Redfish Event Listener/Exporter"
LISTENER_IP="0.0.0.0"
LISTENER_PORT="8080"
METRICS_PORT="2112"
LISTENER_IP="<Listener_IP>"
LISTENER_PORT="<PORT>"
METRICS_PORT="<METRICS_PORT>"
USE_SSL="false"
CERTFILE="path/to/certfile"
KEYFILE="path/to/keyfile"
SLURM_USER="slurm user here"
SLURM_TOKEN="token string here, from secret when for real"
SLURM_CONTROL_NODE="slurm control node IP:Port"
SLURM_CONTROL_NODE="<SLURM_CONTROL_NODE_IP>"
# List of '|' separated reasons; the drain action is avoided if there is a match
SLURM_DRAIN_EXCLUDE_REASON_LIST="reason 1|reason 2"
SLURM_SCONTROL_PATH="/usr/bin/scontrol"

# Match received RAS events on severity and, optionally, on a '|' separated list of message patterns; on a match, perform the drain action with DrainReasonPrefix used as the prefix of the drain reason
TRIGGER_EVENTS="[\
{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
{\"Severity\":\"Critical\",\"Message\":\"message 1|This is a critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
{\"Severity\":\"Info\",\"Message\":\"message 3\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
{\"Severity\":\"Warning\",\"Message\":\"message 4|This is a test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
]"

# Subscription (v1.5+)
Expand All @@ -28,8 +31,8 @@ TRIGGER_EVENTS="[\

# Deprecated <v1.5
SUBSCRIPTION_PAYLOAD="{\
\"Destination\":\"http://host.docker.internal:8080\",\
\"EventTypes\":[\"Alert\",\"StatusChange\"],\
\"Destination\":\"http://<Listener_IP:Port>\",\
\"EventTypes\":[\"Alert\"],\
\"Protocol\":\"Redfish\",\
\"Context\":\"YourContextData\",\
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
Expand All @@ -41,5 +44,5 @@ PROMETHEUS_CONFIG="{\
}"

REDFISH_SERVERS="[\
{\"ip\":\"http://127.0.0.1:8000\",\"username\":\"Username1\",\"password\":\"Password1\",\"loginType\":\"Session\",\"slurmNode\":\"Node1\"}
{\"ip\":\"https://<BMC_IP>\",\"username\":\"<username>\",\"password\":\"<password>\",\"loginType\":\"Session\",\"slurmNode\":\"<nodename>\"}
]"
30 changes: 18 additions & 12 deletions redfish-exporter/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,25 @@ type Config struct {
CertFile string
KeyFile string
}
SlurmToken string
SlurmControlNode string
SlurmUser string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
SlurmToken string
SlurmControlNode string
SlurmUser string
SlurmScontrolPath string
SlurmDrainExcludeStr string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
}

type TriggerEvent struct {
Severity string `json:"Severity"`
Action string `json:"Action"`
Severity string `json:"Severity"`
Action string `json:"Action"`
Message string `json:"Message"`
DrainReasonPrefix string `json:"DrainReasonPrefix"`
}

type PrometheusConfig struct {
Expand Down Expand Up @@ -119,6 +123,8 @@ func setupConfig() Config {
AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")

subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
Expand Down
24 changes: 22 additions & 2 deletions redfish-exporter/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"log"
"net"
"net/http"
"regexp"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
Expand Down Expand Up @@ -219,15 +220,34 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
log.Printf("Origin Of Condition: %s", originOfCondition)
for _, triggerEvent := range AppConfig.TriggerEvents {
if severity == triggerEvent.Severity {
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
if triggerEvent.Message != "" {
re := regexp.MustCompile(triggerEvent.Message)
match := re.FindAllString(message, -1)

if len(match) == 0 {
continue
}
}
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action)
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
if s.slurmQueue != nil {
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
break
}
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)
evt := slurm.AddEventReq{
RedfishServerIP: redfishServerInfo.IP,
SlurmNodeName: redfishServerInfo.SlurmNode,
Severity: triggerEvent.Severity,
Action: triggerEvent.Action,
DrainReasonPrefix: triggerEvent.DrainReasonPrefix,
MessageId: messageId,
Message: message,
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
ScontrolPath: AppConfig.SlurmScontrolPath,
}
s.slurmQueue.Add(evt)
}
break
}
Expand Down
11 changes: 0 additions & 11 deletions redfish-exporter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,6 @@ func main() {
defer cancel()
var slurmQueue *slurm.SlurmQueue
if *enableSlurm {
if len(strings.TrimSpace(AppConfig.SlurmToken)) == 0 {
log.Fatalf("Provide slurm token to enable slurm")
}
if len(strings.TrimSpace(AppConfig.SlurmControlNode)) == 0 {
log.Fatalf("Provide slurm control node IP:Port to enable slurm")
}
_, err := slurm.NewClient(AppConfig.SlurmControlNode, AppConfig.SlurmUser, AppConfig.SlurmToken)
if err != nil {
log.Fatalf("failed to create slurm client, err: %+v", err)
}

slurmQueue = slurm.InitSlurmQueue(ctx)
go slurmQueue.ProcessEventActionQueue()
}
Expand Down
61 changes: 45 additions & 16 deletions redfish-exporter/slurm/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,40 @@ package slurm

import (
"context"
"fmt"
"log"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
)

const (
Drain = "DrainNode"
Drain = "DrainNode"
ExlcudeReasonSet = "DRAIN_EXCLUDE_REASON_SET"
)

// AddEventReq is the exported request type accepted by SlurmQueue.Add, which
// copies every field into the queue's internal eventsActionReq.
type AddEventReq struct {
	// Redfish server the event is attributed to and the Slurm node mapped to it.
	RedfishServerIP, SlurmNodeName string
	// Severity of the matched trigger and the action to perform (e.g. Drain).
	Severity, Action string
	// Together with Severity, these make up the drain reason string.
	DrainReasonPrefix, MessageId, Message string
	// Handed through unchanged to the scontrol-based drain call.
	ExcludeStr, ScontrolPath string
}

type eventsActionReq struct {
redfishServerIP string
slurmNodeName string
severity string
action string
redfishServerIP string
slurmNodeName string
severity string
action string
drainReasonPrefix string
messageId string
message string
excludeStr string
scontrolPath string
}

type SlurmQueue struct {
Expand All @@ -28,12 +47,17 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue {
return &SlurmQueue{ctx: ctx, queue: make(chan *eventsActionReq)}
}

func (q *SlurmQueue) Add(redfishServerIP, slurmNodeName, severity, action string) {
func (q *SlurmQueue) Add(evt AddEventReq) {
q.queue <- &eventsActionReq{
redfishServerIP: redfishServerIP,
slurmNodeName: slurmNodeName,
severity: severity,
action: action,
redfishServerIP: evt.RedfishServerIP,
slurmNodeName: evt.SlurmNodeName,
severity: evt.Severity,
action: evt.Action,
drainReasonPrefix: evt.DrainReasonPrefix,
messageId: evt.MessageId,
message: evt.Message,
excludeStr: evt.ExcludeStr,
scontrolPath: evt.ScontrolPath,
}
}

Expand Down Expand Up @@ -65,19 +89,24 @@ func (q *SlurmQueue) ProcessEventActionQueue() {
}
}

// getDrainReasonString builds the drain reason text in the form
// "<prefix>:redfishlistener:<severity>:<msgId>:<msg>".
// The output order (severity, msgId, msg) differs from the parameter order.
func getDrainReasonString(prefix, msg, msgId, severity string) string {
	const layout = "%s:redfishlistener:%s:%s:%s"
	return fmt.Sprintf(layout, prefix, severity, msgId, msg)
}

func (q *SlurmQueue) performEventAction(req *eventsActionReq) error {
if len(strings.TrimSpace(req.slurmNodeName)) == 0 {
return nil
}

slurmClient := GetClient()
if slurmClient == nil {
return nil
}

if req.action == Drain {
err := slurmClient.DrainNode(req.slurmNodeName)
reason := getDrainReasonString(req.drainReasonPrefix, req.message, req.messageId, req.severity)
err := DrainNodeWithScontrol(req.slurmNodeName, reason, req.excludeStr, req.scontrolPath)
if err != nil {
if strings.Contains(err.Error(), ExlcudeReasonSet) {
log.Printf("Node not drained: %v", err.Error())
return nil
}
log.Printf("Error draining node: %v", err)
return err
}
Expand Down
Loading

0 comments on commit 3b048a6

Please sign in to comment.