diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..9ae060f4fb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,34 @@ +FROM quay.io/sustainable_computing_io/kepler_builder:ubi-9-libbpf-1.2.0 as builder + +WORKDIR /workspace + +COPY . . + +RUN ATTACHER_TAG=libbpf make build + +FROM registry.access.redhat.com/ubi9-minimal:9.2 +RUN microdnf -y update + +ENV NVIDIA_VISIBLE_DEVICES=all + +RUN INSTALL_PKGS=" \ + libbpf \ + " && \ + microdnf install -y $INSTALL_PKGS && \ + microdnf clean all + +COPY --from=builder /workspace/_output/bin/kepler /usr/bin/kepler +COPY --from=builder /libbpf-source/linux-5.14.0-333.el9/tools/bpf/bpftool/bpftool /usr/bin/bpftool +COPY --from=builder /usr/bin/cpuid /usr/bin/cpuid + +RUN mkdir -p /var/lib/kepler/data +RUN mkdir -p /var/lib/kepler/bpfassets +COPY --from=builder /workspace/data/cpus.yaml /var/lib/kepler/data/cpus.yaml +COPY --from=builder /workspace/bpfassets/libbpf/bpf.o /var/lib/kepler/bpfassets + +# copy model weight +COPY --from=builder /workspace/data/model_weight/acpi_AbsPowerModel.json /var/lib/kepler/data/acpi_AbsPowerModel.json +COPY --from=builder /workspace/data/model_weight/acpi_DynPowerModel.json /var/lib/kepler/data/acpi_DynPowerModel.json +COPY --from=builder /workspace/data/model_weight/rapl_AbsPowerModel.json /var/lib/kepler/data/rapl_AbsPowerModel.json +COPY --from=builder /workspace/data/model_weight/rapl_DynPowerModel.json /var/lib/kepler/data/rapl_DynPowerModel.json +ENTRYPOINT ["/usr/bin/kepler"] diff --git a/go.mod b/go.mod index 31b382e0bd..fa664f2683 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/sustainable-computing-io/kepler -go 1.18 +go 1.20 require ( github.com/NVIDIA/go-nvml v0.11.6-0 diff --git a/pkg/collector/metric/node_metric.go b/pkg/collector/metric/node_metric.go index 6b2136773f..f7628bf8b5 100644 --- a/pkg/collector/metric/node_metric.go +++ b/pkg/collector/metric/node_metric.go @@ -39,6 +39,8 @@ const ( OTHER = "other" PLATFORM = "platform" FREQUENCY = 
"frequency" + DYN = "_DYN" + IDLE = "_IDLE" ) var ( @@ -228,46 +230,46 @@ func (ne *NodeMetrics) ToEstimatorValues(featuresName []string, shouldNormalize case config.GpuUsageMetric: // for GPU resource usage featureValues = append(featureValues, normalize(ne.ResourceUsage[config.GpuUsageMetric], shouldNormalize)) - case PKG + "_DYN": // for dynamic PKG power consumption + case PKG + DYN: // for dynamic PKG power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(PKG)), shouldNormalize)) - case CORE + "_DYN": // for dynamic CORE power consumption + case CORE + DYN: // for dynamic CORE power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(CORE)), shouldNormalize)) - case DRAM + "_DYN": // for dynamic DRAM power consumption + case DRAM + DYN: // for dynamic DRAM power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(DRAM)), shouldNormalize)) - case UNCORE + "_DYN": // for dynamic UNCORE power consumption + case UNCORE + DYN: // for dynamic UNCORE power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(UNCORE)), shouldNormalize)) - case OTHER + "_DYN": // for dynamic OTHER power consumption + case OTHER + DYN: // for dynamic OTHER power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(OTHER)), shouldNormalize)) - case PLATFORM + "_DYN": // for dynamic PLATFORM power consumption + case PLATFORM + DYN: // for dynamic PLATFORM power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(PLATFORM)), shouldNormalize)) - case GPU + "_DYN": // for dynamic GPU power consumption + case GPU + DYN: // for dynamic GPU power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaDynEnergyFromAllSources(GPU)), shouldNormalize)) 
- case PKG + "_IDLE": // for idle PKG power consumption + case PKG + IDLE: // for idle PKG power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(PKG)), shouldNormalize)) - case CORE + "_IDLE": // for idle CORE power consumption + case CORE + IDLE: // for idle CORE power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(CORE)), shouldNormalize)) - case DRAM + "_IDLE": // for idle DRAM power consumption + case DRAM + IDLE: // for idle DRAM power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(DRAM)), shouldNormalize)) - case UNCORE + "_IDLE": // for idle UNCORE power consumption + case UNCORE + IDLE: // for idle UNCORE power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(UNCORE)), shouldNormalize)) - case OTHER + "_IDLE": // for idle OTHER power consumption + case OTHER + IDLE: // for idle OTHER power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(OTHER)), shouldNormalize)) - case PLATFORM + "_IDLE": // for idle PLATFORM power consumption + case PLATFORM + IDLE: // for idle PLATFORM power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(PLATFORM)), shouldNormalize)) - case GPU + "_IDLE": // for idle GPU power consumption + case GPU + IDLE: // for idle GPU power consumption featureValues = append(featureValues, normalize(float64(ne.GetSumDeltaIdleEnergyFromAllSources(GPU)), shouldNormalize)) default: diff --git a/pkg/libvirt/resolve_vm.go b/pkg/libvirt/resolve_vm.go index 711d81f730..1823f755ba 100644 --- a/pkg/libvirt/resolve_vm.go +++ b/pkg/libvirt/resolve_vm.go @@ -18,7 +18,7 @@ package libvirt import ( "fmt" - "io/ioutil" + "os" "path/filepath" ) @@ -32,7 +32,7 @@ func getThreadIDsForPID(pid, fullPath string) []string 
{ procDir := fmt.Sprintf(fullPath, pid) - files, err := ioutil.ReadDir(procDir) + files, err := os.ReadDir(procDir) if err != nil { return nil } @@ -51,7 +51,7 @@ func GetCurrentVMPID(path ...string) (map[string]string, error) { path = []string{libvirtPath, procPath} } - files, err := ioutil.ReadDir(path[0]) + files, err := os.ReadDir(path[0]) if err != nil { return nil, err } @@ -63,7 +63,7 @@ func GetCurrentVMPID(path ...string) (map[string]string, error) { if filepath.Ext(file.Name()) == ".pid" { filePath := filepath.Join(path[0], file.Name()) - content, err := ioutil.ReadFile(filePath) + content, err := os.ReadFile(filePath) if err != nil { fmt.Printf("Error reading %s: %v\n", filePath, err) continue diff --git a/pkg/libvirt/resolve_vm_test.go b/pkg/libvirt/resolve_vm_test.go index 67bfa8257b..446bb37121 100644 --- a/pkg/libvirt/resolve_vm_test.go +++ b/pkg/libvirt/resolve_vm_test.go @@ -17,7 +17,6 @@ limitations under the License. package libvirt import ( - "io/ioutil" "os" "path/filepath" "reflect" @@ -34,7 +33,7 @@ func createMockLibvirtDir(directory string) { } for _, file := range mockFiles { - err := ioutil.WriteFile(filepath.Join(directory, file.name), []byte(file.content), 0644) + err := os.WriteFile(filepath.Join(directory, file.name), []byte(file.content), 0644) if err != nil { panic(err) } diff --git a/pkg/model/benchmark_test.go b/pkg/model/benchmark_test.go index 05f7d72f91..eac241e617 100644 --- a/pkg/model/benchmark_test.go +++ b/pkg/model/benchmark_test.go @@ -81,10 +81,11 @@ func benchmarkNtesting(b *testing.B, containerNumber int) { nodeMetrics.UpdateDynEnergy() b.ReportAllocs() containersMetrics := map[string]*collector_metric.ContainerMetrics{} + const CONTAINER = "container" for n := 0; n < containerNumber; n++ { - containersMetrics["container"+strconv.Itoa(n)] = collector_metric.NewContainerMetrics("container"+strconv.Itoa(n), "podA", "test", "container"+strconv.Itoa(n)) - 
containersMetrics["container"+strconv.Itoa(n)].BPFStats[config.CoreUsageMetric] = &types.UInt64Stat{} - _ = containersMetrics["container"+strconv.Itoa(n)].BPFStats[config.CoreUsageMetric].AddNewDelta(30000) + containersMetrics[CONTAINER+strconv.Itoa(n)] = collector_metric.NewContainerMetrics(CONTAINER+strconv.Itoa(n), "podA", "test", CONTAINER+strconv.Itoa(n)) + containersMetrics[CONTAINER+strconv.Itoa(n)].BPFStats[config.CoreUsageMetric] = &types.UInt64Stat{} + _ = containersMetrics[CONTAINER+strconv.Itoa(n)].BPFStats[config.CoreUsageMetric].AddNewDelta(30000) } nodeMetrics.AddNodeResUsageFromContainerResUsage(containersMetrics) b.ResetTimer() diff --git a/pkg/model/container_energy.go b/pkg/model/container_energy.go index 15d0d0f8d3..a5d391d020 100644 --- a/pkg/model/container_energy.go +++ b/pkg/model/container_energy.go @@ -72,18 +72,18 @@ func createContainerPowerModelConfig(powerSourceTarget string, containerFeatureN // NodeFeatureNames contains the metrics that represents the node resource utilization plus the dynamic and idle power power consumption modelConfig.NodeFeatureNames = modelConfig.ContainerFeatureNames modelConfig.NodeFeatureNames = append(modelConfig.NodeFeatureNames, []string{ - collector_metric.PKG + "_DYN", // for dynamic PKG power consumption - collector_metric.CORE + "_DYN", // for dynamic CORE power consumption - collector_metric.DRAM + "_DYN", // for dynamic DRAM power consumption - collector_metric.UNCORE + "_DYN", // for dynamic UNCORE power consumption - collector_metric.OTHER + "_DYN", // for dynamic OTHER power consumption - collector_metric.GPU + "_DYN", // for dynamic GPU power consumption - collector_metric.PKG + "_IDLE", // for idle PKG power consumption - collector_metric.CORE + "_IDLE", // for idle CORE power consumption - collector_metric.DRAM + "_IDLE", // for idle DRAM power consumption - collector_metric.UNCORE + "_IDLE", // for idle UNCORE power consumption - collector_metric.OTHER + "_IDLE", // for idle OTHER power 
consumption - collector_metric.GPU + "_IDLE", // for idle GPU power consumption + collector_metric.PKG + collector_metric.DYN, // for dynamic PKG power consumption + collector_metric.CORE + collector_metric.DYN, // for dynamic CORE power consumption + collector_metric.DRAM + collector_metric.DYN, // for dynamic DRAM power consumption + collector_metric.UNCORE + collector_metric.DYN, // for dynamic UNCORE power consumption + collector_metric.OTHER + collector_metric.DYN, // for dynamic OTHER power consumption + collector_metric.GPU + collector_metric.DYN, // for dynamic GPU power consumption + collector_metric.PKG + collector_metric.IDLE, // for idle PKG power consumption + collector_metric.CORE + collector_metric.IDLE, // for idle CORE power consumption + collector_metric.DRAM + collector_metric.IDLE, // for idle DRAM power consumption + collector_metric.UNCORE + collector_metric.IDLE, // for idle UNCORE power consumption + collector_metric.OTHER + collector_metric.IDLE, // for idle OTHER power consumption + collector_metric.GPU + collector_metric.IDLE, // for idle GPU power consumption }...) } else if powerSourceTarget == config.ContainerPlatformPowerKey { platformUsageMetric := config.CoreUsageMetric @@ -96,8 +96,8 @@ func createContainerPowerModelConfig(powerSourceTarget string, containerFeatureN } modelConfig.NodeFeatureNames = modelConfig.ContainerFeatureNames modelConfig.NodeFeatureNames = append(modelConfig.NodeFeatureNames, []string{ - collector_metric.PLATFORM + "_DYN", // for dynamic PLATFORM power consumption - collector_metric.PLATFORM + "_IDLE", // for idle PLATFORM power consumption + collector_metric.PLATFORM + collector_metric.DYN, // for dynamic PLATFORM power consumption + collector_metric.PLATFORM + collector_metric.IDLE, // for idle PLATFORM power consumption }...) 
} } diff --git a/pkg/model/estimator/local/ratio_model_test.go b/pkg/model/estimator/local/ratio_model_test.go index d5ce5634ad..20d6d2e5a5 100644 --- a/pkg/model/estimator/local/ratio_model_test.go +++ b/pkg/model/estimator/local/ratio_model_test.go @@ -108,24 +108,24 @@ var _ = Describe("Test Ratio Unit", func() { config.GpuUsageMetric, // for GPU resource usage }, NodeFeatureNames: []string{ - config.CoreUsageMetric, // for PKG resource usage - config.CoreUsageMetric, // for CORE resource usage - config.DRAMUsageMetric, // for DRAM resource usage - config.GeneralUsageMetric, // for UNCORE resource usage - config.GeneralUsageMetric, // for OTHER resource usage - config.GpuUsageMetric, // for GPU resource usage - collector_metric.PKG + "_DYN", // for dynamic PKG power consumption - collector_metric.CORE + "_DYN", // for dynamic CORE power consumption - collector_metric.DRAM + "_DYN", // for dynamic PKG power consumption - collector_metric.UNCORE + "_DYN", // for dynamic UNCORE power consumption - collector_metric.OTHER + "_DYN", // for dynamic OTHER power consumption - collector_metric.GPU + "_DYN", // for dynamic GPU power consumption - collector_metric.PKG + "_IDLE", // for idle PKG power consumption - collector_metric.CORE + "_IDLE", // for idle CORE power consumption - collector_metric.DRAM + "_IDLE", // for idle PKG power consumption - collector_metric.UNCORE + "_IDLE", // for idle UNCORE power consumption - collector_metric.OTHER + "_IDLE", // for idle OTHER power consumption - collector_metric.GPU + "_IDLE", // for idle GPU power consumption + config.CoreUsageMetric, // for PKG resource usage + config.CoreUsageMetric, // for CORE resource usage + config.DRAMUsageMetric, // for DRAM resource usage + config.GeneralUsageMetric, // for UNCORE resource usage + config.GeneralUsageMetric, // for OTHER resource usage + config.GpuUsageMetric, // for GPU resource usage + collector_metric.PKG + collector_metric.DYN, // for dynamic PKG power consumption + 
collector_metric.CORE + collector_metric.DYN, // for dynamic CORE power consumption + collector_metric.DRAM + collector_metric.DYN, // for dynamic DRAM power consumption + collector_metric.UNCORE + collector_metric.DYN, // for dynamic UNCORE power consumption + collector_metric.OTHER + collector_metric.DYN, // for dynamic OTHER power consumption + collector_metric.GPU + collector_metric.DYN, // for dynamic GPU power consumption + collector_metric.PKG + collector_metric.IDLE, // for idle PKG power consumption + collector_metric.CORE + collector_metric.IDLE, // for idle CORE power consumption + collector_metric.DRAM + collector_metric.IDLE, // for idle DRAM power consumption + collector_metric.UNCORE + collector_metric.IDLE, // for idle UNCORE power consumption + collector_metric.OTHER + collector_metric.IDLE, // for idle OTHER power consumption + collector_metric.GPU + collector_metric.IDLE, // for idle GPU power consumption }, } model.ResetSampleIdx() diff --git a/pkg/model/process_power.go b/pkg/model/process_power.go index 706e743154..059029f729 100644 --- a/pkg/model/process_power.go +++ b/pkg/model/process_power.go @@ -71,18 +71,18 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames // NodeFeatureNames contains the metrics that represents the node resource utilization plus the dynamic and idle power power consumption modelConfig.NodeFeatureNames = modelConfig.ContainerFeatureNames modelConfig.NodeFeatureNames = append(modelConfig.NodeFeatureNames, []string{ - collector_metric.PKG + "_DYN", // for dynamic PKG power consumption - collector_metric.CORE + "_DYN", // for dynamic CORE power consumption - collector_metric.DRAM + "_DYN", // for dynamic DRAM power consumption - collector_metric.UNCORE + "_DYN", // for dynamic UNCORE power consumption - collector_metric.OTHER + "_DYN", // for dynamic OTHER power consumption - collector_metric.GPU + "_DYN", // for dynamic GPU power consumption - collector_metric.PKG + "_IDLE", // for idle PKG 
power consumption - collector_metric.CORE + "_IDLE", // for idle CORE power consumption - collector_metric.DRAM + "_IDLE", // for idle DRAM power consumption - collector_metric.UNCORE + "_IDLE", // for idle UNCORE power consumption - collector_metric.OTHER + "_IDLE", // for idle OTHER power consumption - collector_metric.GPU + "_IDLE", // for idle GPU power consumption + collector_metric.PKG + collector_metric.DYN, // for dynamic PKG power consumption + collector_metric.CORE + collector_metric.DYN, // for dynamic CORE power consumption + collector_metric.DRAM + collector_metric.DYN, // for dynamic DRAM power consumption + collector_metric.UNCORE + collector_metric.DYN, // for dynamic UNCORE power consumption + collector_metric.OTHER + collector_metric.DYN, // for dynamic OTHER power consumption + collector_metric.GPU + collector_metric.DYN, // for dynamic GPU power consumption + collector_metric.PKG + collector_metric.IDLE, // for idle PKG power consumption + collector_metric.CORE + collector_metric.IDLE, // for idle CORE power consumption + collector_metric.DRAM + collector_metric.IDLE, // for idle DRAM power consumption + collector_metric.UNCORE + collector_metric.IDLE, // for idle UNCORE power consumption + collector_metric.OTHER + collector_metric.IDLE, // for idle OTHER power consumption + collector_metric.GPU + collector_metric.IDLE, // for idle GPU power consumption }...) 
} else if powerSourceTarget == config.ProcessPlatformPowerKey { platformUsageMetric := config.CoreUsageMetric @@ -95,8 +95,8 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames } modelConfig.NodeFeatureNames = modelConfig.ContainerFeatureNames modelConfig.NodeFeatureNames = append(modelConfig.NodeFeatureNames, []string{ - collector_metric.PLATFORM + "_DYN", // for dynamic PLATFORM power consumption - collector_metric.PLATFORM + "_IDLE", // for idle PLATFORM power consumption + collector_metric.PLATFORM + collector_metric.DYN, // for dynamic PLATFORM power consumption + collector_metric.PLATFORM + collector_metric.IDLE, // for idle PLATFORM power consumption }...) } } diff --git a/pkg/power/platform/source/redfish_util.go b/pkg/power/platform/source/redfish_util.go index ecd13771aa..d73cb8deb4 100644 --- a/pkg/power/platform/source/redfish_util.go +++ b/pkg/power/platform/source/redfish_util.go @@ -22,7 +22,6 @@ import ( "encoding/json" "fmt" "io" - "io/ioutil" "net/http" "strings" "time" @@ -70,7 +69,7 @@ func getRedfishModel(access RedfishAccessInfo, endpoint string, model interface{ return err } defer func() { - if _, err := io.Copy(ioutil.Discard, resp.Body); err != nil { + if _, err := io.Copy(io.Discard, resp.Body); err != nil { klog.V(0).Infof("Failed to discard response body: %v", err) } resp.Body.Close()