From b014b754870ede3e4415e68bd17386af329f2b1f Mon Sep 17 00:00:00 2001 From: gshaibi Date: Thu, 28 Nov 2024 11:18:28 +0200 Subject: [PATCH] Skeleton --- .../status-exporter/export/fs/exporter.go | 89 ++----------------- .../export/fs/fake/fake_exporter.go | 71 +++++++++++++++ .../status-exporter/export/fs/fake/metrics.go | 54 +++++++++++ .../export/fs/real/real_exporter.go | 89 +++++++++++++++++++ 4 files changed, 222 insertions(+), 81 deletions(-) create mode 100644 internal/status-exporter/export/fs/fake/fake_exporter.go create mode 100644 internal/status-exporter/export/fs/fake/metrics.go create mode 100644 internal/status-exporter/export/fs/real/real_exporter.go diff --git a/internal/status-exporter/export/fs/exporter.go b/internal/status-exporter/export/fs/exporter.go index d2bcb01..dd20ca7 100644 --- a/internal/status-exporter/export/fs/exporter.go +++ b/internal/status-exporter/export/fs/exporter.go @@ -1,96 +1,23 @@ package fs import ( - "fmt" - "log" - "os" - "path/filepath" - "strconv" - "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export" + "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export/fs/fake" + "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export/fs/real" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch" -) - -type FsExporter struct { - topologyChan <-chan *topology.NodeTopology -} -var _ export.Interface = &FsExporter{} + "github.com/spf13/viper" +) -func NewFsExporter(watcher watch.Interface) *FsExporter { +func NewFsExporter(watcher watch.Interface) export.Interface { topologyChan := make(chan *topology.NodeTopology) watcher.Subscribe(topologyChan) - return &FsExporter{ - topologyChan: topologyChan, - } -} - -func (e *FsExporter) Run(stopCh <-chan struct{}) { - for { - select { - case nodeTopology := <-e.topologyChan: - e.export(nodeTopology) - case 
<-stopCh: - return - } - } -} - -func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { - exportPods(nodeTopology) - exportEvents() -} - -func exportPods(nodeTopology *topology.NodeTopology) { - podProcDir := "/runai/proc/pod" - if err := os.RemoveAll(podProcDir); err != nil { - log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error()) + if viper.GetBool(constants.EnvFakeNode) { + return fake.NewFakeFsExporter(topologyChan) } - for gpuIdx, gpu := range nodeTopology.Gpus { - // Ignoring pods that are not supposed to be seen by runai-container-toolkit - if gpu.Status.AllocatedBy.Namespace != constants.ReservationNs { - continue - } - - for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus { - log.Printf("Exporting pod %s gpu stats to filesystem", podUuid) - - path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx) - if err := os.MkdirAll(path, 0755); err != nil { - log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error()) - } - - if err := writeFile(filepath.Join(path, "utilization.sm"), []byte(strconv.Itoa(gpuUsageStatus.Utilization.Random()))); err != nil { - log.Printf("Failed exporting utilization for pod %s: %s", podUuid, err.Error()) - } - - if err := writeFile(filepath.Join(path, "memory.allocated"), []byte(strconv.Itoa(mbToBytes(gpuUsageStatus.FbUsed)))); err != nil { - log.Printf("Failed exporting memory for pod %s: %s", podUuid, err.Error()) - } - } - } -} - -func exportEvents() { - // For now, only creating the directory without exporting any events. - // In the future, we might want to export events to the filesystem as well. 
- eventsDir := "/runai/proc/events" - if err := os.MkdirAll(eventsDir, 0755); err != nil { - log.Printf("Failed creating directory for events: %s", err.Error()) - } -} - -func writeFile(path string, content []byte) error { - if err := os.WriteFile(path, content, 0644); err != nil { - return fmt.Errorf("failed writing file %s: %w", path, err) - } - return nil -} - -func mbToBytes(mb int) int { - return mb * (1000 * 1000) + return real.NewRealFsExporter(topologyChan) } diff --git a/internal/status-exporter/export/fs/fake/fake_exporter.go b/internal/status-exporter/export/fs/fake/fake_exporter.go new file mode 100644 index 0000000..cd545ab --- /dev/null +++ b/internal/status-exporter/export/fs/fake/fake_exporter.go @@ -0,0 +1,71 @@ +package fake + +import ( + "github.com/run-ai/fake-gpu-operator/internal/common/topology" +) + +// FakeFsExporter exports fake filesystem based prometheus metrics. +type FakeFsExporter struct { + topologyChan <-chan *topology.NodeTopology +} + +func NewFakeFsExporter(topologyChan <-chan *topology.NodeTopology) *FakeFsExporter { + return &FakeFsExporter{ + topologyChan: topologyChan, + } +} + +func (e *FakeFsExporter) Run(stopCh <-chan struct{}) { + for { + select { + case nodeTopology := <-e.topologyChan: + e.export(nodeTopology) + case <-stopCh: + return + } + } +} + +func (e *FakeFsExporter) export(nodeTopology *topology.NodeTopology) { + exportFsBasedMetrics(nodeTopology) +} + +func exportFsBasedMetrics(nodeTopology *topology.NodeTopology) { + // Export the following: + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_utilization", + // "GPU Utilization of Pod", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/utilization.sm"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_memory_used_bytes", + // "GPU Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.allocated"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_swap_ram_used_bytes", 
+ // "GPU Swap Ram Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.swap_ram_used"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_burst_count", + // "GPU Burst OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.burst"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_idle_count", + // "GPU Idle OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.idle"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_priority_count", + // "GPU Priority OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.priority"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_swap_out_of_ram_count", + // "GPU swap out of RAM OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.swap_out_of_ram"), + +} diff --git a/internal/status-exporter/export/fs/fake/metrics.go b/internal/status-exporter/export/fs/fake/metrics.go new file mode 100644 index 0000000..2a80ace --- /dev/null +++ b/internal/status-exporter/export/fs/fake/metrics.go @@ -0,0 +1,54 @@ +package fake + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_utilization", + // "GPU Utilization of Pod", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/utilization.sm"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_memory_used_bytes", + // "GPU Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.allocated"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_swap_ram_used_bytes", + // "GPU Swap Ram Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.swap_ram_used"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_burst_count", + // "GPU Burst OOMKill count", + // 
coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.burst"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_idle_count", + // "GPU Idle OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.idle"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_priority_count", + // "GPU Priority OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.priority"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_swap_out_of_ram_count", + // "GPU swap out of RAM OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.swap_out_of_ram"), + + runaiPodGpuUtil = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "runai_pod_gpu_utilization", + Help: "GPU Utilization of Pod", + }, []string{"pod_uuid", "gpu"}) + + runaiPodGpuMemoryUsedBytes = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "runai_pod_gpu_memory_used_bytes", + Help: "GPU Memory Usage of Pod in Bytes", + }, []string{"pod_uuid", "gpu"}) +) diff --git a/internal/status-exporter/export/fs/real/real_exporter.go b/internal/status-exporter/export/fs/real/real_exporter.go new file mode 100644 index 0000000..3aa617a --- /dev/null +++ b/internal/status-exporter/export/fs/real/real_exporter.go @@ -0,0 +1,89 @@ +package real + +import ( + "fmt" + "log" + "os" + "path/filepath" + "strconv" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" +) + +type RealFsExporter struct { + topologyChan <-chan *topology.NodeTopology +} + +func NewRealFsExporter(topologyChan <-chan *topology.NodeTopology) *RealFsExporter { + return &RealFsExporter{ + topologyChan: topologyChan, + } +} + +func (e *RealFsExporter) Run(stopCh <-chan struct{}) { + for { + select { + case nodeTopology := <-e.topologyChan: + e.export(nodeTopology) + case <-stopCh: + return + } + } +} + +func (e *RealFsExporter) export(nodeTopology *topology.NodeTopology) { + exportPods(nodeTopology) + exportEvents() +} + +func exportPods(nodeTopology 
*topology.NodeTopology) { + podProcDir := "/runai/proc/pod" + if err := os.RemoveAll(podProcDir); err != nil { + log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error()) + } + + for gpuIdx, gpu := range nodeTopology.Gpus { + // Ignoring pods that are not supposed to be seen by runai-container-toolkit + if gpu.Status.AllocatedBy.Namespace != constants.ReservationNs { + continue + } + + for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus { + log.Printf("Exporting pod %s gpu stats to filesystem", podUuid) + + path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx) + if err := os.MkdirAll(path, 0755); err != nil { + log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error()) + } + + if err := writeFile(filepath.Join(path, "utilization.sm"), []byte(strconv.Itoa(gpuUsageStatus.Utilization.Random()))); err != nil { + log.Printf("Failed exporting utilization for pod %s: %s", podUuid, err.Error()) + } + + if err := writeFile(filepath.Join(path, "memory.allocated"), []byte(strconv.Itoa(mbToBytes(gpuUsageStatus.FbUsed)))); err != nil { + log.Printf("Failed exporting memory for pod %s: %s", podUuid, err.Error()) + } + } + } +} + +func exportEvents() { + // For now, only creating the directory without exporting any events. + // In the future, we might want to export events to the filesystem as well. + eventsDir := "/runai/proc/events" + if err := os.MkdirAll(eventsDir, 0755); err != nil { + log.Printf("Failed creating directory for events: %s", err.Error()) + } +} + +func writeFile(path string, content []byte) error { + if err := os.WriteFile(path, content, 0644); err != nil { + return fmt.Errorf("failed writing file %s: %w", path, err) + } + return nil +} + +func mbToBytes(mb int) int { + return mb * (1000 * 1000) +}