Skip to content

Commit

Permalink
feat(host,monitor): 支持容器相关gpu设备监控采集 (#21588)
Browse files Browse the repository at this point in the history
  • Loading branch information
zexi authored Nov 14, 2024
1 parent 5034406 commit bdc9d8e
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 11 deletions.
49 changes: 49 additions & 0 deletions pkg/hostman/hostinfo/hostinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -2496,6 +2496,10 @@ func (h *SHostInfo) OnCatalogChanged(catalog mcclient.KeystoneServiceCatalogV3)
}
}

if h.IsContainerHost() {
h.injectTelegrafDeviceConfig(conf)
}

tsdb, _ := tsdb.GetDefaultServiceSource(s, defaultEndpointType)
if tsdb != nil && len(tsdb.URLs) > 0 {
conf[apis.SERVICE_TYPE_INFLUXDB] = map[string]interface{}{
Expand Down Expand Up @@ -2529,6 +2533,51 @@ func (h *SHostInfo) OnCatalogChanged(catalog mcclient.KeystoneServiceCatalogV3)
}*/
}

// injectTelegrafDeviceConfig adds telegraf input entries to conf for the
// isolated devices reported by the host's isolated device manager.
//
// For every AMD GPU container device the device path is collected (without
// duplicates) under the radeontop entry, together with the radeontop binary
// path. If at least one NETINT device (Quadra or ASIC) is present, an
// ni_rsrc_mon entry is set; if at least one Vastaitech GPU is present, a
// vasmi entry is set. Devices of any other type are ignored. conf is
// modified in place and is left untouched when the host has no devices.
func (h *SHostInfo) injectTelegrafDeviceConfig(conf map[string]interface{}) {
	devs := h.GetIsolatedDeviceManager().GetDevices()
	if len(devs) == 0 {
		return
	}

	// NETINT and Vastaitech inputs are per-host rather than per-device, so
	// only remember whether such a device exists and emit the entry once
	// after the loop.
	hasNetint := false
	hasVasmi := false
	for _, dev := range devs {
		switch dev.GetDeviceType() {
		case string(isolated_device.ContainerDeviceTypeCphAMDGPU):
			devPath := dev.GetDevicePath()
			confMap, ok := conf[system_service.TELEGRAF_INPUT_RADEONTOP].(map[string]interface{})
			if !ok {
				// First AMD GPU seen (or the entry has an unexpected type):
				// create the radeontop entry from scratch.
				conf[system_service.TELEGRAF_INPUT_RADEONTOP] = map[string]interface{}{
					system_service.TELEGRAF_INPUT_CONF_BIN_PATH:       "/usr/bin/radeontop",
					system_service.TELEGRAF_INPUT_RADEONTOP_DEV_PATHS: []string{devPath},
				}
				continue
			}
			// Checked assertion: a missing or mistyped device list is treated
			// as empty instead of panicking, so the ok result is not needed.
			devPaths, _ := confMap[system_service.TELEGRAF_INPUT_RADEONTOP_DEV_PATHS].([]string)
			if !utils.IsInStringArray(devPath, devPaths) {
				confMap[system_service.TELEGRAF_INPUT_RADEONTOP_DEV_PATHS] = append(devPaths, devPath)
			}
		case string(isolated_device.ContainerNetintCAQuadra), string(isolated_device.ContainerNetintCAASIC):
			hasNetint = true
		case string(isolated_device.ContainerDeviceTypeVastaitechGpu):
			hasVasmi = true
		}
	}

	if hasNetint {
		conf[system_service.TELEGAF_INPUT_NETDEV] = map[string]interface{}{
			system_service.TELEGRAF_INPUT_CONF_BIN_PATH: "/usr/bin/ni_rsrc_mon",
		}
	}
	if hasVasmi {
		conf[system_service.TELEGAF_INPUT_VASMI] = map[string]interface{}{
			system_service.TELEGRAF_INPUT_CONF_BIN_PATH: "/usr/bin/vasmi",
		}
	}
}

func (h *SHostInfo) getNicsTelegrafConf() []map[string]interface{} {
var ret = make([]map[string]interface{}, 0)
existing := make(map[string]struct{})
Expand Down
35 changes: 35 additions & 0 deletions pkg/hostman/system_service/telegraf.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ import (
"yunion.io/x/onecloud/pkg/util/procutils"
)

// Keys used for the device-monitoring entries of the telegraf configuration.
// The host agent stores entries under these keys and GetConfig reads them
// back when rendering the telegraf config text.
const (
// TELEGRAF_INPUT_RADEONTOP is the key of the AMD GPU entry; it is also
// rendered as the input section name ("[[inputs.radeontop]]").
TELEGRAF_INPUT_RADEONTOP = "radeontop"
// TELEGRAF_INPUT_RADEONTOP_DEV_PATHS is the key, inside the radeontop entry,
// of the []string list of device paths; it is also the rendered option name.
TELEGRAF_INPUT_RADEONTOP_DEV_PATHS = "device_paths"
// TELEGRAF_INPUT_CONF_BIN_PATH is the key, inside each device input entry,
// of the path to the collector binary.
TELEGRAF_INPUT_CONF_BIN_PATH = "bin_path"
// TELEGAF_INPUT_NETDEV is the key and input section name of the NETINT
// device entry.
// NOTE(review): "TELEGAF" looks like a misspelling of "TELEGRAF"; the name is
// referenced from other packages, so renaming needs a coordinated change.
TELEGAF_INPUT_NETDEV = "ni_rsrc_mon"
// TELEGAF_INPUT_VASMI is the key and input section name of the Vastaitech
// GPU entry (same spelling caveat as TELEGAF_INPUT_NETDEV).
TELEGAF_INPUT_VASMI = "vasmi"
)

type STelegraf struct {
*SBaseSystemService
}
Expand Down Expand Up @@ -289,6 +297,33 @@ func (s *STelegraf) GetConfig(kwargs map[string]interface{}) string {
conf += " keep_field_names = true\n"
conf += "\n"
}

if radontop, ok := kwargs[TELEGRAF_INPUT_RADEONTOP]; ok {
radontopMap, _ := radontop.(map[string]interface{})
devPaths := radontopMap[TELEGRAF_INPUT_RADEONTOP_DEV_PATHS].([]string)
devPathStr := make([]string, len(devPaths))
for i, devPath := range devPaths {
devPathStr[i] = fmt.Sprintf("\"%s\"", devPath)
}
conf += fmt.Sprintf("[[inputs.%s]]\n", TELEGRAF_INPUT_RADEONTOP)
conf += fmt.Sprintf(" bin_path = \"%s\"\n", radontopMap[TELEGRAF_INPUT_CONF_BIN_PATH].(string))
conf += fmt.Sprintf(" %s = [%s]\n", TELEGRAF_INPUT_RADEONTOP_DEV_PATHS, strings.Join(devPathStr, ", "))
conf += "\n"
}

if netdev, ok := kwargs[TELEGAF_INPUT_NETDEV]; ok {
netdevMap, _ := netdev.(map[string]interface{})
conf += fmt.Sprintf("[[inputs.%s]]\n", TELEGAF_INPUT_NETDEV)
conf += fmt.Sprintf(" bin_path = \"%s\"\n", netdevMap[TELEGRAF_INPUT_CONF_BIN_PATH].(string))
conf += "\n"
}

if vasmi, ok := kwargs[TELEGAF_INPUT_VASMI]; ok {
vasmiMap, _ := vasmi.(map[string]interface{})
conf += fmt.Sprintf("[[inputs.%s]]\n", TELEGAF_INPUT_VASMI)
conf += fmt.Sprintf(" bin_path = \"%s\"\n", vasmiMap[TELEGRAF_INPUT_CONF_BIN_PATH].(string))
conf += "\n"
}
return conf
}

Expand Down
85 changes: 74 additions & 11 deletions pkg/monitor/dbinit/metric_dbinit.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,20 @@ func registryNetio(measurement string, displayName string, resType string, score
})
}

func registryNvidaSMI(measurement string, resType string, score int) {
RegistryMetricCreateInput(measurement, "Nvidia GPU metrics",
resType, monitor.METRIC_DATABASE_TELE, score, []monitor.MetricFieldCreateInput{
newMetricFieldCreateInput("clocks_current_graphics", "GPU current clocks, MHz", "", 1),
newMetricFieldCreateInput("clocks_current_memory", "GPU current memory clocks, MHz", "", 2),
newMetricFieldCreateInput("temperature_gpu", "GPU temperature", "", 3),
newMetricFieldCreateInput("memory_total", "GPU memory total size", "", 4),
newMetricFieldCreateInput("memory_free", "GPU memory free size", "", 5),
newMetricFieldCreateInput("memory_used", "GPU memory used size", "", 6),
newMetricFieldCreateInput("utilization_gpu", "GPU utilization", monitor.METRIC_UNIT_PERCENT, 7),
newMetricFieldCreateInput("utilization_memory", "GPU memory utilization", monitor.METRIC_UNIT_PERCENT, 8),
})
}

// order by score asc
// score default:99
func init() {
Expand Down Expand Up @@ -178,6 +192,65 @@ func init() {
newMetricFieldCreateInput("running", "Running processes count", monitor.METRIC_UNIT_COUNT, 4),
newMetricFieldCreateInput("sleeping", "Sleeping processes count", monitor.METRIC_UNIT_COUNT, 5),
})
// gpu
registryNvidaSMI("nvidia_smi", monitor.METRIC_RES_TYPE_HOST, 8)

RegistryMetricCreateInput("radeontop", "AMD GPU metrics", monitor.METRIC_RES_TYPE_HOST, monitor.METRIC_DATABASE_TELE, 9,
[]monitor.MetricFieldCreateInput{
newMetricFieldCreateInput("clocks_current_memory", "GPU current memory clocks, MHz", "", 1),
newMetricFieldCreateInput("clocks_current_shader", "GPU current shader clocks, MHz", "", 2),
newMetricFieldCreateInput("memory_total", "GPU memory total size", "", 3),
newMetricFieldCreateInput("memory_free", "GPU memory free size", "", 4),
newMetricFieldCreateInput("memory_used", "GPU memory used size", "", 5),
newMetricFieldCreateInput("gtt_total", "GPU gtt total size", "", 6),
newMetricFieldCreateInput("gtt_free", "GPU gtt free size", "", 7),
newMetricFieldCreateInput("gtt_used", "GPU gtt used size", "", 8),
newMetricFieldCreateInput("utilization_clock_memory", "GPU block memory utilization", monitor.METRIC_UNIT_PERCENT, 9),
newMetricFieldCreateInput("utilization_clock_shader", "GPU block shader utilization", monitor.METRIC_UNIT_PERCENT, 10),
newMetricFieldCreateInput("utilization_gpu", "GPU utilization", monitor.METRIC_UNIT_PERCENT, 11),
newMetricFieldCreateInput("utilization_memory", "GPU memory utilization", monitor.METRIC_UNIT_PERCENT, 12),
newMetricFieldCreateInput("utilization_event_engine", "GPU event engine utilization", monitor.METRIC_UNIT_PERCENT, 13),
newMetricFieldCreateInput("utilization_vertex_grouper_tesselator", "GPU vertex grouper tesselator utilization", monitor.METRIC_UNIT_PERCENT, 14),
newMetricFieldCreateInput("utilization_texture_addresser", "GPU texture addresser utilization", monitor.METRIC_UNIT_PERCENT, 15),
newMetricFieldCreateInput("utilization_shader_exporter", "GPU shader export utilization", monitor.METRIC_UNIT_PERCENT, 16),
newMetricFieldCreateInput("utilization_sequencer_instruction_cache", "GPU sequencer instruction cache utilization", monitor.METRIC_UNIT_PERCENT, 17),
newMetricFieldCreateInput("utilization_shader_interpolator", "GPU shader interpolator utilization", monitor.METRIC_UNIT_PERCENT, 18),
newMetricFieldCreateInput("utilization_scan_converter", "GPU scan converter utilization", monitor.METRIC_UNIT_PERCENT, 19),
newMetricFieldCreateInput("utilization_primitive_assembly", "GPU primitive assembly utilization", monitor.METRIC_UNIT_PERCENT, 20),
newMetricFieldCreateInput("utilization_depth_block", "GPU depth block utilization", monitor.METRIC_UNIT_PERCENT, 21),
newMetricFieldCreateInput("utilization_color_block", "GPU color block utilization", monitor.METRIC_UNIT_PERCENT, 22),
})

RegistryMetricCreateInput("vasmi", "Vasmi GPU metrics",
monitor.METRIC_RES_TYPE_HOST, monitor.METRIC_DATABASE_TELE, 11, []monitor.MetricFieldCreateInput{
newMetricFieldCreateInput("temperature_gpu", "GPU temperature", "", 1),
newMetricFieldCreateInput("utilization_gpu", "GPU utilization", monitor.METRIC_UNIT_PERCENT, 2),
newMetricFieldCreateInput("utilization_memory", "GPU memory utilization", monitor.METRIC_UNIT_PERCENT, 3),
newMetricFieldCreateInput("utilization_share_memory", "GPU memory utilization", monitor.METRIC_UNIT_PERCENT, 3),
newMetricFieldCreateInput("utilization_encoder", "GPU encoder utilization", monitor.METRIC_UNIT_PERCENT, 4),
newMetricFieldCreateInput("utilization_decoder", "GPU decoder utilization", monitor.METRIC_UNIT_PERCENT, 5),
newMetricFieldCreateInput("utilization_ai", "GPU AI utilization", monitor.METRIC_UNIT_PERCENT, 6),
newMetricFieldCreateInput("clocks_current_gpu", "GPU current clocks, MHz", "", 7),
newMetricFieldCreateInput("oclk", "oclk, MHz", "", 8),
newMetricFieldCreateInput("dclk", "dclk, MHz", "", 9),
newMetricFieldCreateInput("eclk", "eclk, MHz", "", 10),
newMetricFieldCreateInput("gclk", "gclk, MHz", "", 11),
newMetricFieldCreateInput("aic_power", "AIC power", "", 12),
})

RegistryMetricCreateInput("ni_rsrc_mon", "NETINT device metrics",
monitor.METRIC_RES_TYPE_HOST, monitor.METRIC_DATABASE_TELE, 12,
[]monitor.MetricFieldCreateInput{
newMetricFieldCreateInput("load", "Load utilization", monitor.METRIC_UNIT_PERCENT, 1),
newMetricFieldCreateInput("model_load", "Model load utilization", monitor.METRIC_UNIT_PERCENT, 2),
newMetricFieldCreateInput("fw_load", "FW load utilization", monitor.METRIC_UNIT_PERCENT, 3),
newMetricFieldCreateInput("inst", "INST utilization", monitor.METRIC_UNIT_PERCENT, 4),
newMetricFieldCreateInput("max_inst", "MAX INST utilization", monitor.METRIC_UNIT_PERCENT, 5),
newMetricFieldCreateInput("mem", "Memory utilization", monitor.METRIC_UNIT_PERCENT, 6),
newMetricFieldCreateInput("critical_mem", "Critical memory utilization", monitor.METRIC_UNIT_PERCENT, 7),
newMetricFieldCreateInput("share_mem", "Share memory utilization", monitor.METRIC_UNIT_PERCENT, 8),
newMetricFieldCreateInput("p2p_mem", "P2P memory utilization", monitor.METRIC_UNIT_PERCENT, 9),
})

// vm_cpu
RegistryMetricCreateInput("vm_cpu", "Guest CPU usage", monitor.METRIC_RES_TYPE_GUEST,
Expand Down Expand Up @@ -421,17 +494,7 @@ func init() {
})

// agent nvidia_smi
RegistryMetricCreateInput("agent_nvidia_smi", "Collect Nvidia GPU metrics",
monitor.METRIC_RES_TYPE_AGENT, monitor.METRIC_DATABASE_TELE, 8, []monitor.MetricFieldCreateInput{
newMetricFieldCreateInput("clocks_current_graphics", "GPU current clocks, MHz", "", 1),
newMetricFieldCreateInput("clocks_current_memory", "GPU current memory clocks, MHz", "", 2),
newMetricFieldCreateInput("temperature_gpu", "GPU temperature", "", 3),
newMetricFieldCreateInput("memory_total", "GPU memory total size", "", 4),
newMetricFieldCreateInput("memory_free", "GPU memory free size", "", 5),
newMetricFieldCreateInput("memory_used", "GPU memory used size", "", 6),
newMetricFieldCreateInput("utilization_gpu", "GPU utilization", "", 7),
newMetricFieldCreateInput("utilization_memory", "GPU memory utilization", "", 8),
})
registryNvidaSMI("agent_nvidia_smi", monitor.METRIC_RES_TYPE_AGENT, 8)

RegistryMetricCreateInput("storage", "Storage usage",
monitor.METRIC_RES_TYPE_STORAGE, monitor.METRIC_DATABASE_TELE, 1, []monitor.MetricFieldCreateInput{
Expand Down
5 changes: 5 additions & 0 deletions pkg/monitor/models/datasource.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"yunion.io/x/log"
"yunion.io/x/pkg/errors"
"yunion.io/x/pkg/tristate"
"yunion.io/x/pkg/util/sets"
"yunion.io/x/pkg/utils"

"yunion.io/x/onecloud/pkg/apis/monitor"
Expand Down Expand Up @@ -850,6 +851,10 @@ func floatEquals(a, b float64) bool {
var filterKey = []string{"perf_instance", "res_type", "status", "cloudregion", "os_type", "is_vm"}

func filterTagKey(key string) bool {
whiteListIdKeys := sets.NewString("dev_id", "die_id")
if whiteListIdKeys.Has(key) {
return false
}
if strings.Contains(key, "_id") {
return true
}
Expand Down

0 comments on commit bdc9d8e

Please sign in to comment.