Skip to content

Commit

Permalink
fix: Improve logging verbosity in vGPU monitor for better debugging
Browse files — browse the repository at this point in the history
  • Loading branch information
haitwang-cloud committed Feb 7, 2025
1 parent 55077e8 commit 50ee5bd
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 15 deletions.
16 changes: 8 additions & 8 deletions cmd/vGPUmonitor/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,27 +232,27 @@ func Observe(lister *nvidia.ContainerLister) {
utilizationSwitch := c.Info.GetUtilizationSwitch()
if CheckBlocking(utSwitchOn, priority, c) {
if recentKernel >= 0 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting Blocking to on %v", idx)
klog.V(5).Infof("utSwitchon=%v", utSwitchOn)
klog.V(5).Infof("Setting Blocking to on %v", idx)
c.Info.SetRecentKernel(-1)
}
} else {
if recentKernel < 0 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting Blocking to off %v", idx)
klog.V(5).Infof("utSwitchon=%v", utSwitchOn)
klog.V(5).Infof("Setting Blocking to off %v", idx)
c.Info.SetRecentKernel(0)
}
}
if CheckPriority(utSwitchOn, priority, c) {
if utilizationSwitch != 1 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting UtilizationSwitch to on %v", idx)
klog.V(5).Infof("utSwitchon=%v", utSwitchOn)
klog.V(5).Infof("Setting UtilizationSwitch to on %v", idx)
c.Info.SetUtilizationSwitch(1)
}
} else {
if utilizationSwitch != 0 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting UtilizationSwitch to off %v", idx)
klog.V(5).Infof("utSwitchon=%v", utSwitchOn)
klog.V(5).Infof("Setting UtilizationSwitch to off %v", idx)
c.Info.SetUtilizationSwitch(0)
}
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/vGPUmonitor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func start() {
}

func initMetrics(ctx context.Context, containerLister *nvidia.ContainerLister) error {
klog.Info("Initializing metrics for vGPUmonitor")
klog.V(4).Info("Initializing metrics for vGPUmonitor")
reg := prometheus.NewRegistry()
//reg := prometheus.NewPedanticRegistry()

Expand All @@ -137,7 +137,7 @@ func initMetrics(ctx context.Context, containerLister *nvidia.ContainerLister) e

// Graceful shutdown on context cancellation
<-ctx.Done()
klog.Info("Shutting down metrics server")
klog.V(4).Info("Shutting down metrics server")
if err := server.Shutdown(context.Background()); err != nil {
return err
}
Expand Down
10 changes: 5 additions & 5 deletions cmd/vGPUmonitor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,18 +313,18 @@ func (cc ClusterManagerCollector) collectPodAndContainerInfo(ch chan<- prometheu
for _, pod := range pods {
podContainers, found := containerMap[string(pod.UID)]
if !found {
klog.V(4).Infof("No containers found for pod %s/%s", pod.Namespace, pod.Name)
klog.V(5).Infof("No containers found for pod %s/%s", pod.Namespace, pod.Name)
continue
}

klog.V(2).Infof("Processing Pod %s/%s", pod.Namespace, pod.Name)
klog.V(5).Infof("Processing Pod %s/%s", pod.Namespace, pod.Name)

// Iterate through each container in the Pod
for _, ctr := range pod.Spec.Containers {
// Find the matching container
for _, c := range podContainers {
if c.ContainerName == ctr.Name {
klog.V(2).Infof("Processing Container %s in Pod %s/%s", ctr.Name, pod.Namespace, pod.Name)
klog.V(5).Infof("Processing Container %s in Pod %s/%s", ctr.Name, pod.Namespace, pod.Name)
if err := cc.collectContainerMetrics(ch, pod, ctr, c, nowSec); err != nil {
klog.Errorf("Failed to collect metrics for container %s in Pod %s/%s: %v", ctr.Name, pod.Namespace, pod.Name, err)
}
Expand All @@ -334,7 +334,7 @@ func (cc ClusterManagerCollector) collectPodAndContainerInfo(ch chan<- prometheu
}
}

klog.V(2).Infof("Finished collecting metrics for %d pods", len(pods))
klog.V(4).Infof("Finished collecting metrics for %d pods", len(pods))
return nil
}

Expand Down Expand Up @@ -409,7 +409,7 @@ func (cc ClusterManagerCollector) collectContainerMetrics(ch chan<- prometheus.M
}
}

klog.V(2).Infof("Successfully collected metrics for Pod %s/%s, Container %s", pod.Namespace, pod.Name, ctr.Name)
klog.V(5).Infof("Successfully collected metrics for Pod %s/%s, Container %s", pod.Namespace, pod.Name, ctr.Name)
return nil
}

Expand Down

0 comments on commit 50ee5bd

Please sign in to comment.