Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the code to use scontrol in place of slurm APIs to drain the #74

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions redfish-exporter/.env
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
UPDATED="2024-09-24"
UPDATED="2025-01-22"
DESCRIPTION="Redfish Event Listener/Exporter"
LISTENER_IP="0.0.0.0"
LISTENER_PORT="8080"
METRICS_PORT="2112"
LISTENER_IP="<Listener_IP>"
LISTENER_PORT="<PORT>"
METRICS_PORT="<METRICS_PORT>"
USE_SSL="false"
CERTFILE="path/to/certfile"
KEYFILE="path/to/keyfile"
SLURM_USER="slurm user here"
SLURM_TOKEN="token string here, from secret when for real"
SLURM_CONTROL_NODE="slurm control node IP:Port"
SLURM_CONTROL_NODE="<SLURM_CONTROL_NODE_IP>"
# List of '|' separated reasons for avoiding the drain action if there is a match
SLURM_DRAIN_EXCLUDE_REASON_LIST="reason 1|reason 2"
SLURM_SCONTROL_PATH="/usr/bin/scontrol"

# Match received RAS events on severity and a '|' separated list of message fields, then perform the drain action with DrainReasonPrefix set as the prefix of the drain reason.
# Message can be left empty if it does not need to be matched; in that case only the severity is matched.
# Only the DrainNode action is supported for now.
TRIGGER_EVENTS="[\
{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
{\"Severity\":\"Critical\",\"Message\":\"message 1|This is a critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is an action necessary if it's just going to be a drain anyway?

Could you also add a comment for these fields as well to make it clear?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left it as is in case we need to use it for some other action in the future. Will update the comments

{\"Severity\":\"Info\",\"Message\":\"message 3\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
{\"Severity\":\"Warning\",\"Message\":\"message 4|This is a test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
]"

# Subscription (v1.5+)
Expand All @@ -28,8 +33,8 @@ TRIGGER_EVENTS="[\

# Deprecated <v1.5
SUBSCRIPTION_PAYLOAD="{\
\"Destination\":\"http://host.docker.internal:8080\",\
\"EventTypes\":[\"Alert\",\"StatusChange\"],\
\"Destination\":\"http://<Listener_IP:Port>\",\
\"EventTypes\":[\"Alert\"],\
\"Protocol\":\"Redfish\",\
\"Context\":\"YourContextData\",\
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
Expand All @@ -41,5 +46,5 @@ PROMETHEUS_CONFIG="{\
}"

REDFISH_SERVERS="[\
{\"ip\":\"http://127.0.0.1:8000\",\"username\":\"Username1\",\"password\":\"Password1\",\"loginType\":\"Session\",\"slurmNode\":\"Node1\"}
]"
{\"ip\":\"https://<BMC_IP>\",\"username\":\"<username>\",\"password\":\"<password>\",\"loginType\":\"Session\",\"slurmNode\":\"<nodename>\"}
]"
30 changes: 18 additions & 12 deletions redfish-exporter/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,25 @@ type Config struct {
CertFile string
KeyFile string
}
SlurmToken string
SlurmControlNode string
SlurmUser string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
SlurmToken string
SlurmControlNode string
SlurmUser string
SlurmScontrolPath string
SlurmDrainExcludeStr string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
}

type TriggerEvent struct {
Severity string `json:"Severity"`
Action string `json:"Action"`
Severity string `json:"Severity"`
Action string `json:"Action"`
Message string `json:"Message"`
DrainReasonPrefix string `json:"DrainReasonPrefix"`
}

type PrometheusConfig struct {
Expand Down Expand Up @@ -119,6 +123,8 @@ func setupConfig() Config {
AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")

subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
Expand Down
24 changes: 22 additions & 2 deletions redfish-exporter/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"log"
"net"
"net/http"
"regexp"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
Expand Down Expand Up @@ -219,15 +220,34 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
log.Printf("Origin Of Condition: %s", originOfCondition)
for _, triggerEvent := range AppConfig.TriggerEvents {
if severity == triggerEvent.Severity {
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
if triggerEvent.Message != "" {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, the action will only be triggered if the severity message matches as well?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the user doesn't want to match on the 'message' field, it can be left empty. I will add it to the comments in .env

re := regexp.MustCompile(triggerEvent.Message)
match := re.FindAllString(message, -1)

if len(match) == 0 {
continue
}
}
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action)
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
if s.slurmQueue != nil {
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
break
}
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)
evt := slurm.AddEventReq{
RedfishServerIP: redfishServerInfo.IP,
SlurmNodeName: redfishServerInfo.SlurmNode,
Severity: triggerEvent.Severity,
Action: triggerEvent.Action,
DrainReasonPrefix: triggerEvent.DrainReasonPrefix,
MessageId: messageId,
Message: message,
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
ScontrolPath: AppConfig.SlurmScontrolPath,
}
s.slurmQueue.Add(evt)
}
break
}
Expand Down
11 changes: 0 additions & 11 deletions redfish-exporter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,6 @@ func main() {
defer cancel()
var slurmQueue *slurm.SlurmQueue
if *enableSlurm {
if len(strings.TrimSpace(AppConfig.SlurmToken)) == 0 {
log.Fatalf("Provide slurm token to enable slurm")
}
if len(strings.TrimSpace(AppConfig.SlurmControlNode)) == 0 {
log.Fatalf("Provide slurm control node IP:Port to enable slurm")
}
_, err := slurm.NewClient(AppConfig.SlurmControlNode, AppConfig.SlurmUser, AppConfig.SlurmToken)
if err != nil {
log.Fatalf("failed to create slurm client, err: %+v", err)
}

slurmQueue = slurm.InitSlurmQueue(ctx)
go slurmQueue.ProcessEventActionQueue()
}
Expand Down
61 changes: 45 additions & 16 deletions redfish-exporter/slurm/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,40 @@ package slurm

import (
"context"
"fmt"
"log"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
)

const (
Drain = "DrainNode"
Drain = "DrainNode"
ExlcudeReasonSet = "DRAIN_EXCLUDE_REASON_SET"
Copy link
Collaborator

@yuva29 yuva29 Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the user provide this? Is this different than the exclude reason from .env?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for internal logging

)

// AddEventReq describes one event action submitted to the queue through
// SlurmQueue.Add, which copies every field into an internal eventsActionReq.
type AddEventReq struct {
	// RedfishServerIP is the IP of the Redfish server the event is associated with.
	RedfishServerIP string
	// SlurmNodeName is the node handed to the drain call; performEventAction
	// does nothing when it is blank.
	SlurmNodeName string
	// Severity is the event severity; it becomes part of the drain reason.
	Severity string
	// Action selects what to do with the node; performEventAction compares it
	// against Drain ("DrainNode").
	Action string
	// DrainReasonPrefix is the leading component of the drain reason string.
	DrainReasonPrefix string
	// MessageId is the event's message identifier; it becomes part of the drain reason.
	MessageId string
	// Message is the event message text; it becomes part of the drain reason.
	Message string
	// ExcludeStr is forwarded unchanged to DrainNodeWithScontrol.
	// NOTE(review): presumably the '|' separated drain-exclusion reason list
	// from configuration — confirm against the caller.
	ExcludeStr string
	// ScontrolPath is forwarded unchanged to DrainNodeWithScontrol.
	// NOTE(review): presumably the path to the scontrol binary — confirm.
	ScontrolPath string
}

type eventsActionReq struct {
redfishServerIP string
slurmNodeName string
severity string
action string
redfishServerIP string
slurmNodeName string
severity string
action string
drainReasonPrefix string
messageId string
message string
excludeStr string
scontrolPath string
}

type SlurmQueue struct {
Expand All @@ -28,12 +47,17 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue {
return &SlurmQueue{ctx: ctx, queue: make(chan *eventsActionReq)}
}

func (q *SlurmQueue) Add(redfishServerIP, slurmNodeName, severity, action string) {
func (q *SlurmQueue) Add(evt AddEventReq) {
q.queue <- &eventsActionReq{
redfishServerIP: redfishServerIP,
slurmNodeName: slurmNodeName,
severity: severity,
action: action,
redfishServerIP: evt.RedfishServerIP,
slurmNodeName: evt.SlurmNodeName,
severity: evt.Severity,
action: evt.Action,
drainReasonPrefix: evt.DrainReasonPrefix,
messageId: evt.MessageId,
message: evt.Message,
excludeStr: evt.ExcludeStr,
scontrolPath: evt.ScontrolPath,
}
}

Expand Down Expand Up @@ -65,19 +89,24 @@ func (q *SlurmQueue) ProcessEventActionQueue() {
}
}

// getDrainReasonString builds the reason text used when draining a node.
//
// The result is colon-separated in this order: prefix, the literal
// "redfishlistener", severity, msgId, msg. Note that the parameter order
// (msg before msgId) differs from the order in which they appear in the output.
func getDrainReasonString(prefix, msg, msgId, severity string) string {
	const layout = "%s:redfishlistener:%s:%s:%s"
	return fmt.Sprintf(layout, prefix, severity, msgId, msg)
}

func (q *SlurmQueue) performEventAction(req *eventsActionReq) error {
if len(strings.TrimSpace(req.slurmNodeName)) == 0 {
return nil
}

slurmClient := GetClient()
if slurmClient == nil {
return nil
}

if req.action == Drain {
err := slurmClient.DrainNode(req.slurmNodeName)
reason := getDrainReasonString(req.drainReasonPrefix, req.message, req.messageId, req.severity)
err := DrainNodeWithScontrol(req.slurmNodeName, reason, req.excludeStr, req.scontrolPath)
if err != nil {
if strings.Contains(err.Error(), ExlcudeReasonSet) {
log.Printf("Node not drained: %v", err.Error())
return nil
}
log.Printf("Error draining node: %v", err)
return err
}
Expand Down
Loading