-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update the code to use scontrol in place of slurm APIs to drain the node #74
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ import ( | |
"log" | ||
"net" | ||
"net/http" | ||
"regexp" | ||
"strings" | ||
|
||
"github.com/nod-ai/ADA/redfish-exporter/metrics" | ||
|
@@ -219,15 +220,34 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque | |
log.Printf("Origin Of Condition: %s", originOfCondition) | ||
for _, triggerEvent := range AppConfig.TriggerEvents { | ||
if severity == triggerEvent.Severity { | ||
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action) | ||
if triggerEvent.Message != "" { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So, the action will only be triggered if the severity message matches as well? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the user doesn't want to match on the 'message' field, it can be left empty. I will add it to the comments in .env. |
||
re := regexp.MustCompile(triggerEvent.Message) | ||
match := re.FindAllString(message, -1) | ||
|
||
if len(match) == 0 { | ||
continue | ||
} | ||
} | ||
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action) | ||
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map. | ||
if s.slurmQueue != nil { | ||
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip) | ||
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 { | ||
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action) | ||
break | ||
} | ||
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action) | ||
evt := slurm.AddEventReq{ | ||
RedfishServerIP: redfishServerInfo.IP, | ||
SlurmNodeName: redfishServerInfo.SlurmNode, | ||
Severity: triggerEvent.Severity, | ||
Action: triggerEvent.Action, | ||
DrainReasonPrefix: triggerEvent.DrainReasonPrefix, | ||
MessageId: messageId, | ||
Message: message, | ||
ExcludeStr: AppConfig.SlurmDrainExcludeStr, | ||
ScontrolPath: AppConfig.SlurmScontrolPath, | ||
} | ||
s.slurmQueue.Add(evt) | ||
} | ||
break | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,21 +2,40 @@ package slurm | |
|
||
import ( | ||
"context" | ||
"fmt" | ||
"log" | ||
"strings" | ||
|
||
"github.com/nod-ai/ADA/redfish-exporter/metrics" | ||
) | ||
|
||
const ( | ||
Drain = "DrainNode" | ||
Drain = "DrainNode" | ||
ExlcudeReasonSet = "DRAIN_EXCLUDE_REASON_SET" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does the user provide this? Is this different from the exclude reason in .env? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is for internal logging. |
||
) | ||
|
||
// AddEventReq carries the parameters of one event handed to SlurmQueue.Add. | ||
// Add copies every field into an internal eventsActionReq and sends it on the queue. | ||
type AddEventReq struct { | ||
// RedfishServerIP is the IP address of the Redfish server associated with the event. | ||
RedfishServerIP string | ||
// SlurmNodeName is the Slurm node to act on; performEventAction does nothing when it is blank. | ||
SlurmNodeName string | ||
// Severity is the event severity; it is embedded in the drain reason string. | ||
Severity string | ||
// Action selects what to do with the node; performEventAction compares it against Drain. | ||
Action string | ||
// DrainReasonPrefix is the leading component of the drain reason string. | ||
DrainReasonPrefix string | ||
// MessageId is the event's message identifier; it is embedded in the drain reason string. | ||
MessageId string | ||
// Message is the event's message text; it is embedded in the drain reason string. | ||
Message string | ||
// ExcludeStr is passed through to DrainNodeWithScontrol. | ||
// NOTE(review): presumably a node whose existing reason matches it is not drained - confirm. | ||
ExcludeStr string | ||
// ScontrolPath is passed through to DrainNodeWithScontrol. | ||
// NOTE(review): presumably the path to the scontrol binary - confirm. | ||
ScontrolPath string | ||
} | ||
|
||
type eventsActionReq struct { | ||
redfishServerIP string | ||
slurmNodeName string | ||
severity string | ||
action string | ||
redfishServerIP string | ||
slurmNodeName string | ||
severity string | ||
action string | ||
drainReasonPrefix string | ||
messageId string | ||
message string | ||
excludeStr string | ||
scontrolPath string | ||
} | ||
|
||
type SlurmQueue struct { | ||
|
@@ -28,12 +47,17 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue { | |
return &SlurmQueue{ctx: ctx, queue: make(chan *eventsActionReq)} | ||
} | ||
|
||
func (q *SlurmQueue) Add(redfishServerIP, slurmNodeName, severity, action string) { | ||
func (q *SlurmQueue) Add(evt AddEventReq) { | ||
q.queue <- &eventsActionReq{ | ||
redfishServerIP: redfishServerIP, | ||
slurmNodeName: slurmNodeName, | ||
severity: severity, | ||
action: action, | ||
redfishServerIP: evt.RedfishServerIP, | ||
slurmNodeName: evt.SlurmNodeName, | ||
severity: evt.Severity, | ||
action: evt.Action, | ||
drainReasonPrefix: evt.DrainReasonPrefix, | ||
messageId: evt.MessageId, | ||
message: evt.Message, | ||
excludeStr: evt.ExcludeStr, | ||
scontrolPath: evt.ScontrolPath, | ||
} | ||
} | ||
|
||
|
@@ -65,19 +89,24 @@ func (q *SlurmQueue) ProcessEventActionQueue() { | |
} | ||
} | ||
|
||
// getDrainReasonString builds the reason text recorded when a node is drained.
// The result is the colon-separated sequence
// "<prefix>:redfishlistener:<severity>:<msgId>:<msg>"; empty arguments are
// kept as empty components, so the number of colons is always the same.
func getDrainReasonString(prefix, msg, msgId, severity string) string {
	// The argument order below differs from the parameter order on purpose:
	// severity and message ID come before the free-form message text.
	const layout = "%s:redfishlistener:%s:%s:%s"
	return fmt.Sprintf(layout, prefix, severity, msgId, msg)
}
|
||
func (q *SlurmQueue) performEventAction(req *eventsActionReq) error { | ||
if len(strings.TrimSpace(req.slurmNodeName)) == 0 { | ||
return nil | ||
} | ||
|
||
slurmClient := GetClient() | ||
if slurmClient == nil { | ||
return nil | ||
} | ||
|
||
if req.action == Drain { | ||
err := slurmClient.DrainNode(req.slurmNodeName) | ||
reason := getDrainReasonString(req.drainReasonPrefix, req.message, req.messageId, req.severity) | ||
err := DrainNodeWithScontrol(req.slurmNodeName, reason, req.excludeStr, req.scontrolPath) | ||
if err != nil { | ||
if strings.Contains(err.Error(), ExlcudeReasonSet) { | ||
log.Printf("Node not drained: %v", err.Error()) | ||
return nil | ||
} | ||
log.Printf("Error draining node: %v", err) | ||
return err | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is an action necessary if it's just going to be a drain anyway?
Could you also add a comment for each of these fields to make it clear?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I left it as is in case we need to use it for some other action in the future. Will update the comments.