From b014b754870ede3e4415e68bd17386af329f2b1f Mon Sep 17 00:00:00 2001 From: gshaibi Date: Thu, 28 Nov 2024 11:18:28 +0200 Subject: [PATCH] Skeleton --- .../status-exporter/export/fs/exporter.go | 89 ++----------------- .../export/fs/fake/fake_exporter.go | 71 +++++++++++++++ .../status-exporter/export/fs/fake/metrics.go | 54 +++++++++++ .../export/fs/real/real_exporter.go | 89 +++++++++++++++++++ 4 files changed, 222 insertions(+), 81 deletions(-) create mode 100644 internal/status-exporter/export/fs/fake/fake_exporter.go create mode 100644 internal/status-exporter/export/fs/fake/metrics.go create mode 100644 internal/status-exporter/export/fs/real/real_exporter.go diff --git a/internal/status-exporter/export/fs/exporter.go b/internal/status-exporter/export/fs/exporter.go index d2bcb01..dd20ca7 100644 --- a/internal/status-exporter/export/fs/exporter.go +++ b/internal/status-exporter/export/fs/exporter.go @@ -1,96 +1,23 @@ package fs import ( - "fmt" - "log" - "os" - "path/filepath" - "strconv" - "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export" + "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export/fs/fake" + "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export/fs/real" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch" -) - -type FsExporter struct { - topologyChan <-chan *topology.NodeTopology -} -var _ export.Interface = &FsExporter{} + "github.com/spf13/viper" +) -func NewFsExporter(watcher watch.Interface) *FsExporter { +func NewFsExporter(watcher watch.Interface) export.Interface { topologyChan := make(chan *topology.NodeTopology) watcher.Subscribe(topologyChan) - return &FsExporter{ - topologyChan: topologyChan, - } -} - -func (e *FsExporter) Run(stopCh <-chan struct{}) { - for { - select { - case nodeTopology := <-e.topologyChan: - e.export(nodeTopology) - case 
<-stopCh: - return - } - } -} - -func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { - exportPods(nodeTopology) - exportEvents() -} - -func exportPods(nodeTopology *topology.NodeTopology) { - podProcDir := "/runai/proc/pod" - if err := os.RemoveAll(podProcDir); err != nil { - log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error()) + if viper.GetBool(constants.EnvFakeNode) { + return fake.NewFakeFsExporter(topologyChan) } - for gpuIdx, gpu := range nodeTopology.Gpus { - // Ignoring pods that are not supposed to be seen by runai-container-toolkit - if gpu.Status.AllocatedBy.Namespace != constants.ReservationNs { - continue - } - - for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus { - log.Printf("Exporting pod %s gpu stats to filesystem", podUuid) - - path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx) - if err := os.MkdirAll(path, 0755); err != nil { - log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error()) - } - - if err := writeFile(filepath.Join(path, "utilization.sm"), []byte(strconv.Itoa(gpuUsageStatus.Utilization.Random()))); err != nil { - log.Printf("Failed exporting utilization for pod %s: %s", podUuid, err.Error()) - } - - if err := writeFile(filepath.Join(path, "memory.allocated"), []byte(strconv.Itoa(mbToBytes(gpuUsageStatus.FbUsed)))); err != nil { - log.Printf("Failed exporting memory for pod %s: %s", podUuid, err.Error()) - } - } - } -} - -func exportEvents() { - // For now, only creating the directory without exporting any events. - // In the future, we might want to export events to the filesystem as well. 
- eventsDir := "/runai/proc/events" - if err := os.MkdirAll(eventsDir, 0755); err != nil { - log.Printf("Failed creating directory for events: %s", err.Error()) - } -} - -func writeFile(path string, content []byte) error { - if err := os.WriteFile(path, content, 0644); err != nil { - return fmt.Errorf("failed writing file %s: %w", path, err) - } - return nil -} - -func mbToBytes(mb int) int { - return mb * (1000 * 1000) + return real.NewRealFsExporter(topologyChan) } diff --git a/internal/status-exporter/export/fs/fake/fake_exporter.go b/internal/status-exporter/export/fs/fake/fake_exporter.go new file mode 100644 index 0000000..cd545ab --- /dev/null +++ b/internal/status-exporter/export/fs/fake/fake_exporter.go @@ -0,0 +1,71 @@ +package fake + +import ( + "github.com/run-ai/fake-gpu-operator/internal/common/topology" +) + +// FakeFsExporter exports fake filesystem based prometheus metrics. +type FakeFsExporter struct { + topologyChan <-chan *topology.NodeTopology +} + +func NewFakeFsExporter(topologyChan <-chan *topology.NodeTopology) *FakeFsExporter { + return &FakeFsExporter{ + topologyChan: topologyChan, + } +} + +func (e *FakeFsExporter) Run(stopCh <-chan struct{}) { + for { + select { + case nodeTopology := <-e.topologyChan: + e.export(nodeTopology) + case <-stopCh: + return + } + } +} + +func (e *FakeFsExporter) export(nodeTopology *topology.NodeTopology) { + exportFsBasedMetrics(nodeTopology) +} + +func exportFsBasedMetrics(nodeTopology *topology.NodeTopology) { + // Export the following: + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_utilization", + // "GPU Utilization of Pod", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/utilization.sm"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_memory_used_bytes", + // "GPU Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.allocated"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_swap_ram_used_bytes", 
+ // "GPU Swap Ram Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.swap_ram_used"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_burst_count", + // "GPU Burst OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.burst"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_idle_count", + // "GPU Idle OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.idle"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_priority_count", + // "GPU Priority OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.priority"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_swap_out_of_ram_count", + // "GPU swap out of RAM OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.swap_out_of_ram"), + +} diff --git a/internal/status-exporter/export/fs/fake/metrics.go b/internal/status-exporter/export/fs/fake/metrics.go new file mode 100644 index 0000000..2a80ace --- /dev/null +++ b/internal/status-exporter/export/fs/fake/metrics.go @@ -0,0 +1,54 @@ +package fake + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_utilization", + // "GPU Utilization of Pod", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/utilization.sm"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_memory_used_bytes", + // "GPU Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.allocated"), + // core_team_metric.NewCoreTeamMetric( + // "runai_pod_gpu_swap_ram_used_bytes", + // "GPU Swap Ram Memory Usage of Pod in Bytes", + // coreTeamMetricsDir, + // "pod/{pod_uuid}/metrics/gpu/{gpu}/memory.swap_ram_used"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_burst_count", + // "GPU Burst OOMKill count", + // 
coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.burst"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_idle_count", + // "GPU Idle OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.idle"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_priority_count", + // "GPU Priority OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.priority"), + // core_team_metric.NewCoreTeamMetric( + // "runai_gpu_oomkill_swap_out_of_ram_count", + // "GPU swap out of RAM OOMKill count", + // coreTeamMetricsDir, + // "metrics/gpu/{gpu}/oom.swap_out_of_ram"), + + runaiPodGpuUtil = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "runai_pod_gpu_utilization", + Help: "GPU Utilization of Pod", + }, []string{"pod_uuid", "gpu"}) + + runaiPodGpuMemoryUsedBytes = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "runai_pod_gpu_memory_used_bytes", + Help: "GPU Memory Usage of Pod in Bytes", + }, []string{"pod_uuid", "gpu"}) +) diff --git a/internal/status-exporter/export/fs/real/real_exporter.go b/internal/status-exporter/export/fs/real/real_exporter.go new file mode 100644 index 0000000..3aa617a --- /dev/null +++ b/internal/status-exporter/export/fs/real/real_exporter.go @@ -0,0 +1,89 @@ +package real + +import ( + "fmt" + "log" + "os" + "path/filepath" + "strconv" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" +) + +type RealFsExporter struct { + topologyChan <-chan *topology.NodeTopology +} + +func NewRealFsExporter(topologyChan <-chan *topology.NodeTopology) *RealFsExporter { + return &RealFsExporter{ + topologyChan: topologyChan, + } +} + +func (e *RealFsExporter) Run(stopCh <-chan struct{}) { + for { + select { + case nodeTopology := <-e.topologyChan: + e.export(nodeTopology) + case <-stopCh: + return + } + } +} + +func (e *RealFsExporter) export(nodeTopology *topology.NodeTopology) { + exportPods(nodeTopology) + exportEvents() +} + +func exportPods(nodeTopology 
*topology.NodeTopology) { + podProcDir := "/runai/proc/pod" + if err := os.RemoveAll(podProcDir); err != nil { + log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error()) + } + + for gpuIdx, gpu := range nodeTopology.Gpus { + // Ignoring pods that are not supposed to be seen by runai-container-toolkit + if gpu.Status.AllocatedBy.Namespace != constants.ReservationNs { + continue + } + + for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus { + log.Printf("Exporting pod %s gpu stats to filesystem", podUuid) + + path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx) + if err := os.MkdirAll(path, 0755); err != nil { + log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error()) + } + + if err := writeFile(filepath.Join(path, "utilization.sm"), []byte(strconv.Itoa(gpuUsageStatus.Utilization.Random()))); err != nil { + log.Printf("Failed exporting utilization for pod %s: %s", podUuid, err.Error()) + } + + if err := writeFile(filepath.Join(path, "memory.allocated"), []byte(strconv.Itoa(mbToBytes(gpuUsageStatus.FbUsed)))); err != nil { + log.Printf("Failed exporting memory for pod %s: %s", podUuid, err.Error()) + } + } + } +} + +func exportEvents() { + // For now, only creating the directory without exporting any events. + // In the future, we might want to export events to the filesystem as well. + eventsDir := "/runai/proc/events" + if err := os.MkdirAll(eventsDir, 0755); err != nil { + log.Printf("Failed creating directory for events: %s", err.Error()) + } +} + +func writeFile(path string, content []byte) error { + if err := os.WriteFile(path, content, 0644); err != nil { + return fmt.Errorf("failed writing file %s: %w", path, err) + } + return nil +} + +func mbToBytes(mb int) int { + return mb * (1000 * 1000) +}