From 123aa8e6ed7cf172eb43b7c3898b316e7c28b1e6 Mon Sep 17 00:00:00 2001 From: Austin Georgiades <34201358+algchoo@users.noreply.github.com> Date: Thu, 1 Aug 2024 07:07:24 -0400 Subject: [PATCH] Apache Hadoop `k8s` plugin support (#1288) * update datanode dashboard to support k8s * update namenode dashboard for k8s support * update nodemanager dashboard for k8s support * update resourcemanager dashboard for k8s support * add multiclusterSelector * moved hadoop_cluster label to getMatcher method * add hadoop_cluster to getMatcher for all dashboards --- apache-hadoop-mixin/config.libsonnet | 3 + .../hadoop-datanode-overview.libsonnet | 42 +++-- .../hadoop-namenode-overview.libsonnet | 84 ++++++---- .../hadoop-nodemanager-overview.libsonnet | 150 ++++++++++-------- .../hadoop-resourcemanager-overview.libsonnet | 100 +++++++----- 5 files changed, 219 insertions(+), 160 deletions(-) diff --git a/apache-hadoop-mixin/config.libsonnet b/apache-hadoop-mixin/config.libsonnet index 86611e8e1..309443c48 100644 --- a/apache-hadoop-mixin/config.libsonnet +++ b/apache-hadoop-mixin/config.libsonnet @@ -16,5 +16,8 @@ alertsCriticalResourceManagerMemoryUsage: 80, // % enableLokiLogs: true, + enableMultiCluster: false, + multiclusterSelector: 'job=~"$job"', + hadoopSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"', }, } diff --git a/apache-hadoop-mixin/dashboards/hadoop-datanode-overview.libsonnet b/apache-hadoop-mixin/dashboards/hadoop-datanode-overview.libsonnet index bf735efe3..2a49d743e 100644 --- a/apache-hadoop-mixin/dashboards/hadoop-datanode-overview.libsonnet +++ b/apache-hadoop-mixin/dashboards/hadoop-datanode-overview.libsonnet @@ -8,6 +8,8 @@ local dashboardUid = 'apache-hadoop-datanode-overview'; local promDatasourceName = 'prometheus_datasource'; local lokiDatasourceName = 'loki_datasource'; +local getMatcher(cfg) = '%(hadoopSelector)s, instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"' % cfg; + local promDatasource = { uid: '${%s}' % promDatasourceName, }; @@ -30,11 +32,11 @@ local datanodesRow = { collapsed: false, }; -local unreadBlocksEvictedPanel = { +local unreadBlocksEvictedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_datanode_ramdiskblocksevictedwithoutread{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:])', + 'increase(hadoop_datanode_ramdiskblocksevictedwithoutread{' + matcher + '}[$__interval:])', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -107,11 +109,11 @@ local unreadBlocksEvictedPanel = { }, }; -local blocksRemovedPanel = { +local blocksRemovedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_datanode_blocksremoved{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:])', + 'increase(hadoop_datanode_blocksremoved{' + matcher + '}[$__interval:])', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -184,11 +186,11 @@ local blocksRemovedPanel = { }, }; -local volumeFailuresPanel = { +local volumeFailuresPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_datanode_volumefailures{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:])', + 'increase(hadoop_datanode_volumefailures{' + matcher + '}[$__interval:])', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', 
format='time_series', @@ -261,13 +263,13 @@ local volumeFailuresPanel = { }, }; -local datanodeLogsPanel = { +local datanodeLogsPanel(matcher) = { datasource: lokiDatasource, targets: [ { datasource: lokiDatasource, editorMode: 'code', - expr: '{job=~"$job", hadoop_cluster=~"$hadoop_cluster", instance=~"$instance", filename=~".*/hadoop/logs/.*-datanode.*.log"} |= ``', + expr: '{' + matcher + '} |= `` | (filename=~".*/hadoop/logs/.*-datanode.*.log" or log_type="datanode")', queryType: 'range', refId: 'A', }, @@ -338,10 +340,22 @@ local datanodeLogsPanel = { allValues='.+', sort=1 ), + template.new( + 'cluster', + promDatasource, + 'label_values(hadoop_datanode_ramdiskblocksevictedwithoutread{%(multiclusterSelector)s}, cluster)' % $._config, + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='.*', + hide=if $._config.enableMultiCluster then '' else 'variable', + sort=0 + ), template.new( 'instance', promDatasource, - 'label_values(hadoop_datanode_ramdiskblocksevictedwithoutread{job=~"$job"}, instance)', + 'label_values(hadoop_datanode_ramdiskblocksevictedwithoutread{%(hadoopSelector)s}, instance)' % $._config, label='Instance', refresh=2, includeAll=true, @@ -352,7 +366,7 @@ local datanodeLogsPanel = { template.new( 'hadoop_cluster', promDatasource, - 'label_values(hadoop_datanode_ramdiskblocksevictedwithoutread{job=~"$job"}, hadoop_cluster)', + 'label_values(hadoop_datanode_ramdiskblocksevictedwithoutread{%(hadoopSelector)s}, hadoop_cluster)' % $._config, label='Hadoop cluster', refresh=2, includeAll=true, @@ -367,12 +381,12 @@ local datanodeLogsPanel = { std.flattenArrays([ [ datanodesRow { gridPos: { h: 1, w: 24, x: 0, y: 0 } }, - unreadBlocksEvictedPanel { gridPos: { h: 6, w: 8, x: 0, y: 1 } }, - blocksRemovedPanel { gridPos: { h: 6, w: 8, x: 8, y: 1 } }, - volumeFailuresPanel { gridPos: { h: 6, w: 8, x: 16, y: 1 } }, + unreadBlocksEvictedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 0, y: 1 } }, + blocksRemovedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 8, y: 1 } }, + volumeFailuresPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 16, y: 1 } }, ], if $._config.enableLokiLogs then [ - datanodeLogsPanel { gridPos: { h: 8, w: 24, x: 0, y: 7 } }, + datanodeLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 7 } }, ] else [], [ ], diff --git a/apache-hadoop-mixin/dashboards/hadoop-namenode-overview.libsonnet b/apache-hadoop-mixin/dashboards/hadoop-namenode-overview.libsonnet index 91c39faa7..9cc0dac6d 100644 --- a/apache-hadoop-mixin/dashboards/hadoop-namenode-overview.libsonnet +++ b/apache-hadoop-mixin/dashboards/hadoop-namenode-overview.libsonnet @@ -8,6 +8,8 @@ local dashboardUid = 'apache-hadoop-namenode-overview'; local promDatasourceName = 'prometheus_datasource'; local lokiDatasourceName = 'loki_datasource'; +local getMatcher(cfg) = '%(hadoopSelector)s, instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"' % cfg; + local promDatasource = { uid: '${%s}' % promDatasourceName, }; @@ -16,26 +18,26 @@ local lokiDatasource = { uid: '${%s}' % lokiDatasourceName, }; -local datanodeStatePanel = { +local datanodeStatePanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_numlivedatanodes{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_numlivedatanodes{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - live DataNodes', ), prometheus.target( - 
'hadoop_namenode_numdeaddatanodes{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_numdeaddatanodes{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - dead DataNodes', ), prometheus.target( - 'hadoop_namenode_numstaledatanodes{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_namenode_numstaledatanodes{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - stale DataNodes', ), prometheus.target( - 'hadoop_namenode_numdecommissioningdatanodes{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_numdecommissioningdatanodes{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - decommissioning DataNodes', ), @@ -80,11 +82,11 @@ local datanodeStatePanel = { }, }; -local capacityUtilizationPanel = { +local capacityUtilizationPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - '100 * hadoop_namenode_capacityused{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"} / clamp_min(hadoop_namenode_capacitytotal{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}, 1)', + '100 * hadoop_namenode_capacityused{' + matcher + ', name="FSNamesystem"} / clamp_min(hadoop_namenode_capacitytotal{' + matcher + ', name="FSNamesystem"}, 1)', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -156,11 +158,11 @@ local capacityUtilizationPanel = { }, }; -local totalBlocksPanel = { +local totalBlocksPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_blockstotal{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_blockstotal{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -232,11 +234,11 @@ local totalBlocksPanel = { }, }; -local missingBlocksPanel = { +local missingBlocksPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_missingblocks{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_missingblocks{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -308,11 +310,11 @@ local missingBlocksPanel = { }, }; -local underreplicatedBlocksPanel = { +local underreplicatedBlocksPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_underreplicatedblocks{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_underreplicatedblocks{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -384,11 +386,11 @@ local underreplicatedBlocksPanel = { }, }; -local transactionsSinceLastCheckpointPanel = { +local transactionsSinceLastCheckpointPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_transactionssincelastcheckpoint{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 
'hadoop_namenode_transactionssincelastcheckpoint{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -460,11 +462,11 @@ local transactionsSinceLastCheckpointPanel = { }, }; -local volumeFailuresPanel = { +local volumeFailuresPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_namenode_volumefailurestotal{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}[$__interval:])', + 'increase(hadoop_namenode_volumefailurestotal{' + matcher + ', name="FSNamesystem"}[$__interval:])', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -537,11 +539,11 @@ local volumeFailuresPanel = { }, }; -local totalFilesPanel = { +local totalFilesPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_filestotal{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_filestotal{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -613,11 +615,11 @@ local totalFilesPanel = { }, }; -local totalLoadPanel = { +local totalLoadPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_namenode_totalload{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="FSNamesystem"}', + 'hadoop_namenode_totalload{' + matcher + ', name="FSNamesystem"}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -689,13 +691,13 @@ local totalLoadPanel = { }, }; -local namenodeLogsPanel = { +local namenodeLogsPanel(matcher) = { datasource: lokiDatasource, targets: [ { datasource: lokiDatasource, editorMode: 'code', - expr: '{job=~"$job", hadoop_cluster=~"$hadoop_cluster", instance=~"$instance", filename=~".*/hadoop/logs/.*-namenode.*.log"} |= ``', + expr: '{' + matcher + '} |= `` | (filename=~".*/hadoop/logs/.*-namenode.*.log" or log_type="namenode")', queryType: 'range', refId: 'A', }, @@ -766,10 +768,22 @@ local namenodeLogsPanel = { allValues='.+', sort=1 ), + template.new( + 'cluster', + promDatasource, + 'label_values(hadoop_namenode_blockstotal{%(multiclusterSelector)s}, cluster)' % $._config, + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='.*', + hide=if $._config.enableMultiCluster then '' else 'variable', + sort=0 + ), template.new( 'instance', promDatasource, - 'label_values(hadoop_namenode_blockstotal{job=~"$job"}, instance)', + 'label_values(hadoop_namenode_blockstotal{%(hadoopSelector)s}, instance)' % $._config, label='Instance', refresh=2, includeAll=true, @@ -780,7 +794,7 @@ local namenodeLogsPanel = { template.new( 'hadoop_cluster', promDatasource, - 'label_values(hadoop_namenode_blockstotal{job=~"$job"}, hadoop_cluster)', + 'label_values(hadoop_namenode_blockstotal{%(hadoopSelector)s}, hadoop_cluster)' % $._config, label='Hadoop cluster', refresh=2, includeAll=true, @@ -794,18 +808,18 @@ local namenodeLogsPanel = { .addPanels( std.flattenArrays([ [ - datanodeStatePanel { gridPos: { h: 9, w: 12, x: 0, y: 0 } }, - capacityUtilizationPanel { gridPos: { h: 9, w: 12, x: 12, y: 0 } }, - totalBlocksPanel { gridPos: { h: 6, w: 8, x: 0, y: 9 } }, - missingBlocksPanel { gridPos: { h: 6, w: 8, x: 8, y: 9 } }, - underreplicatedBlocksPanel { gridPos: { h: 6, w: 8, x: 16, y: 9 } }, - 
transactionsSinceLastCheckpointPanel { gridPos: { h: 6, w: 12, x: 0, y: 15 } }, - volumeFailuresPanel { gridPos: { h: 6, w: 12, x: 12, y: 15 } }, - totalFilesPanel { gridPos: { h: 6, w: 12, x: 0, y: 21 } }, - totalLoadPanel { gridPos: { h: 6, w: 12, x: 12, y: 21 } }, + datanodeStatePanel(getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 0, y: 0 } }, + capacityUtilizationPanel(getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 0 } }, + totalBlocksPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 0, y: 9 } }, + missingBlocksPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 8, y: 9 } }, + underreplicatedBlocksPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 16, y: 9 } }, + transactionsSinceLastCheckpointPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 15 } }, + volumeFailuresPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 15 } }, + totalFilesPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 21 } }, + totalLoadPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 21 } }, ], if $._config.enableLokiLogs then [ - namenodeLogsPanel { gridPos: { h: 8, w: 24, x: 0, y: 27 } }, + namenodeLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 27 } }, ] else [], [ ], diff --git a/apache-hadoop-mixin/dashboards/hadoop-nodemanager-overview.libsonnet b/apache-hadoop-mixin/dashboards/hadoop-nodemanager-overview.libsonnet index 10edbf8b4..ccf10d32a 100644 --- a/apache-hadoop-mixin/dashboards/hadoop-nodemanager-overview.libsonnet +++ b/apache-hadoop-mixin/dashboards/hadoop-nodemanager-overview.libsonnet @@ -8,6 +8,8 @@ local dashboardUid = 'apache-hadoop-nodemanager-overview'; local promDatasourceName = 'prometheus_datasource'; local lokiDatasourceName = 'loki_datasource'; +local getMatcher(cfg) = '%(hadoopSelector)s, instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"' % cfg; + local promDatasource = { uid: '${%s}' % promDatasourceName, }; @@ -16,11 +18,11 @@ local lokiDatasource = { uid: '${%s}' % lokiDatasourceName, }; -local applicationsRunningPanel = { +local applicationsRunningPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_applicationsrunning{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_applicationsrunning{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -64,11 +66,11 @@ local applicationsRunningPanel = { pluginVersion: '10.0.2-cloud.1.94a6f396', }; -local allocatedContainersPanel = { +local allocatedContainersPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_allocatedcontainers{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_allocatedcontainers{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -112,11 +114,11 @@ local allocatedContainersPanel = { pluginVersion: '10.0.2-cloud.1.94a6f396', }; -local containersLocalizationDurationPanel = { +local containersLocalizationDurationPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_localizationdurationmillisavgtime{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_localizationdurationmillisavgtime{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -160,11 
+162,11 @@ local containersLocalizationDurationPanel = { pluginVersion: '10.0.2-cloud.1.94a6f396', }; -local containersLaunchDurationPanel = { +local containersLaunchDurationPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_containerlaunchdurationavgtime{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_containerlaunchdurationavgtime{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -222,17 +224,17 @@ local jvmRow = { collapsed: false, }; -local memoryUsedPanel = { +local memoryUsedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_memheapusedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memheapusedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - heap', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_memnonheapusedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memnonheapusedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - nonheap', format='time_series', @@ -303,17 +305,17 @@ local memoryUsedPanel = { }, }; -local memoryCommittedPanel = { +local memoryCommittedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_memheapcommittedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memheapcommittedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - heap', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_memnonheapcommittedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memnonheapcommittedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - nonheap', format='time_series', @@ -384,11 +386,11 @@ local memoryCommittedPanel = { }, }; -local garbageCollectionCountPanel = { +local garbageCollectionCountPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_nodemanager_gccount{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:])', + 'increase(hadoop_nodemanager_gccount{' + matcher + '}[$__interval:])', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -460,11 +462,11 @@ local garbageCollectionCountPanel = { }, }; -local averageGarbageCollectionTimePanel = { +local averageGarbageCollectionTimePanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_nodemanager_gctimemillis{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:]) / clamp_min(increase(hadoop_nodemanager_gccount{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:]), 1)', + 'increase(hadoop_nodemanager_gctimemillis{' + matcher + '}[$__interval:]) / clamp_min(increase(hadoop_nodemanager_gccount{' + matcher + '}[$__interval:]), 1)', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -550,17 +552,17 @@ local nodeRow = { collapsed: false, }; -local 
nodeMemoryUsedPanel = { +local nodeMemoryUsedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_memheapusedm{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memheapusedm{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - heap', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_memnonheapusedm{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memnonheapusedm{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - nonheap', ), @@ -630,17 +632,17 @@ local nodeMemoryUsedPanel = { }, }; -local nodeMemoryCommittedPanel = { +local nodeMemoryCommittedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_memheapcommittedm{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memheapcommittedm{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - heap', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_memnonheapcommittedm{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_memnonheapcommittedm{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - nonheap', ), @@ -710,11 +712,11 @@ local nodeMemoryCommittedPanel = { }, }; -local nodeCPUUtilizationPanel = { +local nodeCPUUtilizationPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - '100 * hadoop_nodemanager_nodecpuutilization{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + '100 * hadoop_nodemanager_nodecpuutilization{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -785,11 +787,11 @@ local nodeCPUUtilizationPanel = { }, }; -local nodeGPUUtilizationPanel = { +local nodeGPUUtilizationPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - '100 * hadoop_nodemanager_nodegpuutilization{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + '100 * hadoop_nodemanager_nodegpuutilization{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -874,47 +876,47 @@ local containersRow = { collapsed: false, }; -local containersStatePanel = { +local containersStatePanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_containerspaused{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containerspaused{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - paused', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_containerslaunched{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containerslaunched{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - launched', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_containerscompleted{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containerscompleted{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - completed', format='time_series', ), prometheus.target( - 
'hadoop_nodemanager_containersfailed{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containersfailed{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - failed', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_containerskilled{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containerskilled{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - killed', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_containersiniting{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containersiniting{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - initing', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_containersreiniting{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"} > 0', + 'hadoop_nodemanager_containersreiniting{' + matcher + '} > 0', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - reiniting', format='time_series', @@ -985,11 +987,11 @@ local containersStatePanel = { }, }; -local containersUsedMemoryPanel = { +local containersUsedMemoryPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_containerusedmemgb{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_containerusedmemgb{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -1060,11 +1062,11 @@ local containersUsedMemoryPanel = { }, }; -local containersUsedVirtualMemoryPanel = { +local containersUsedVirtualMemoryPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_containerusedvmemgb{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_containerusedvmemgb{' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -1135,17 +1137,17 @@ local containersUsedVirtualMemoryPanel = { }, }; -local containersAvailableMemoryPanel = { +local containersAvailableMemoryPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_availablegb{name="NodeManagerMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_availablegb{name="NodeManagerMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - available', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_allocatedgb{name="NodeManagerMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_allocatedgb{name="NodeManagerMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - allocated', format='time_series', @@ -1216,17 +1218,17 @@ local containersAvailableMemoryPanel = { }, }; -local containersAvailableVirtualCoresPanel = { +local containersAvailableVirtualCoresPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_nodemanager_availablevcores{name="NodeManagerMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_availablevcores{name="NodeManagerMetrics", ' + matcher + '}', 
datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - available', format='time_series', ), prometheus.target( - 'hadoop_nodemanager_allocatedvcores{name="NodeManagerMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_nodemanager_allocatedvcores{name="NodeManagerMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}} - allocated', format='time_series', @@ -1297,13 +1299,13 @@ local containersAvailableVirtualCoresPanel = { }, }; -local nodemanagerLogsPanel = { +local nodemanagerLogsPanel(matcher) = { datasource: lokiDatasource, targets: [ { datasource: lokiDatasource, editorMode: 'code', - expr: '{job=~"$job", hadoop_cluster=~"$hadoop_cluster", instance=~"$instance", filename=~".*/hadoop/logs/.*-nodemanager.*.log"} |= ``', + expr: '{' + matcher + '} |= `` | (filename=~".*/hadoop/logs/.*-nodemanager.*.log" or log_type="nodemanager")', queryType: 'range', refId: 'A', }, @@ -1374,10 +1376,22 @@ local nodemanagerLogsPanel = { allValues='.+', sort=1 ), + template.new( + 'cluster', + promDatasource, + 'label_values(hadoop_nodemanager_availablegb{%(multiclusterSelector)s}, cluster)' % $._config, + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='.*', + hide=if $._config.enableMultiCluster then '' else 'variable', + sort=0 + ), template.new( 'instance', promDatasource, - 'label_values(hadoop_nodemanager_availablegb{job=~"$job"}, instance)', + 'label_values(hadoop_nodemanager_availablegb{%(hadoopSelector)s}, instance)' % $._config, label='Instance', refresh=2, includeAll=true, @@ -1388,7 +1402,7 @@ local nodemanagerLogsPanel = { template.new( 'hadoop_cluster', promDatasource, - 'label_values(hadoop_nodemanager_availablegb{job=~"$job"}, hadoop_cluster)', + 'label_values(hadoop_nodemanager_availablegb{%(hadoopSelector)s}, hadoop_cluster)' % $._config, label='Hadoop cluster', refresh=2, includeAll=true, @@ -1402,29 +1416,29 @@ local nodemanagerLogsPanel = { .addPanels( std.flattenArrays([ [ - applicationsRunningPanel { gridPos: { h: 6, w: 6, x: 0, y: 0 } }, - allocatedContainersPanel { gridPos: { h: 6, w: 6, x: 6, y: 0 } }, - containersLocalizationDurationPanel { gridPos: { h: 6, w: 6, x: 12, y: 0 } }, - containersLaunchDurationPanel { gridPos: { h: 6, w: 6, x: 18, y: 0 } }, + applicationsRunningPanel(getMatcher($._config)) { gridPos: { h: 6, w: 6, x: 0, y: 0 } }, + allocatedContainersPanel(getMatcher($._config)) { gridPos: { h: 6, w: 6, x: 6, y: 0 } }, + containersLocalizationDurationPanel(getMatcher($._config)) { gridPos: { h: 6, w: 6, x: 12, y: 0 } }, + containersLaunchDurationPanel(getMatcher($._config)) { gridPos: { h: 6, w: 6, x: 18, y: 0 } }, jvmRow { gridPos: { h: 1, w: 24, x: 0, y: 6 } }, - memoryUsedPanel { gridPos: { h: 6, w: 12, x: 0, y: 7 } }, - memoryCommittedPanel { gridPos: { h: 6, w: 12, x: 12, y: 7 } }, - garbageCollectionCountPanel { gridPos: { h: 6, w: 12, x: 0, y: 13 } }, - averageGarbageCollectionTimePanel { gridPos: { h: 6, w: 12, x: 12, y: 13 } }, + memoryUsedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 7 } }, + memoryCommittedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 7 } }, + garbageCollectionCountPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 13 } }, + averageGarbageCollectionTimePanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 13 } }, nodeRow { gridPos: { h: 1, w: 24, x: 0, y: 19 } }, - nodeMemoryUsedPanel { gridPos: { h: 6, w: 12, x: 0, y: 20 } }, - 
nodeMemoryCommittedPanel { gridPos: { h: 6, w: 12, x: 12, y: 20 } }, - nodeCPUUtilizationPanel { gridPos: { h: 6, w: 12, x: 0, y: 26 } }, - nodeGPUUtilizationPanel { gridPos: { h: 6, w: 12, x: 12, y: 26 } }, + nodeMemoryUsedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 20 } }, + nodeMemoryCommittedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 20 } }, + nodeCPUUtilizationPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 26 } }, + nodeGPUUtilizationPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 26 } }, containersRow { gridPos: { h: 1, w: 24, x: 0, y: 32 } }, - containersStatePanel { gridPos: { h: 6, w: 8, x: 0, y: 33 } }, - containersUsedMemoryPanel { gridPos: { h: 6, w: 8, x: 8, y: 33 } }, - containersUsedVirtualMemoryPanel { gridPos: { h: 6, w: 8, x: 16, y: 33 } }, - containersAvailableMemoryPanel { gridPos: { h: 6, w: 12, x: 0, y: 39 } }, - containersAvailableVirtualCoresPanel { gridPos: { h: 6, w: 12, x: 12, y: 39 } }, + containersStatePanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 0, y: 33 } }, + containersUsedMemoryPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 8, y: 33 } }, + containersUsedVirtualMemoryPanel(getMatcher($._config)) { gridPos: { h: 6, w: 8, x: 16, y: 33 } }, + containersAvailableMemoryPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 39 } }, + containersAvailableVirtualCoresPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 39 } }, ], if $._config.enableLokiLogs then [ - nodemanagerLogsPanel { gridPos: { h: 8, w: 24, x: 0, y: 45 } }, + nodemanagerLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 45 } }, ] else [], [ ], diff --git a/apache-hadoop-mixin/dashboards/hadoop-resourcemanager-overview.libsonnet b/apache-hadoop-mixin/dashboards/hadoop-resourcemanager-overview.libsonnet index 56a8ca452..2e194405d 100644 --- a/apache-hadoop-mixin/dashboards/hadoop-resourcemanager-overview.libsonnet +++ b/apache-hadoop-mixin/dashboards/hadoop-resourcemanager-overview.libsonnet @@ -8,6 +8,8 @@ local dashboardUid = 'apache-hadoop-resourcemanager-overview'; local promDatasourceName = 'prometheus_datasource'; local lokiDatasourceName = 'loki_datasource'; +local getMatcher(cfg) = '%(hadoopSelector)s, instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"' % cfg; + local promDatasource = { uid: '${%s}' % promDatasourceName, }; @@ -16,36 +18,36 @@ local lokiDatasource = { uid: '${%s}' % lokiDatasourceName, }; -local nodeManagersStatePanel = { +local nodeManagersStatePanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_resourcemanager_numactivenms{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="ClusterMetrics",}', + 'hadoop_resourcemanager_numactivenms{' + matcher + ', name="ClusterMetrics",}', datasource=promDatasource, legendFormat='active', ), prometheus.target( - 'hadoop_resourcemanager_numdecommissionednms{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="ClusterMetrics",}', + 'hadoop_resourcemanager_numdecommissionednms{' + matcher + ', name="ClusterMetrics",}', datasource=promDatasource, legendFormat='decommissioned', ), prometheus.target( - 'hadoop_resourcemanager_numlostnms{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="ClusterMetrics"}', + 'hadoop_resourcemanager_numlostnms{' + matcher + ', name="ClusterMetrics"}', datasource=promDatasource, legendFormat='lost', ), prometheus.target( - 
'hadoop_resourcemanager_numunhealthynms{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="ClusterMetrics"}', + 'hadoop_resourcemanager_numunhealthynms{' + matcher + ', name="ClusterMetrics"}', datasource=promDatasource, legendFormat='healthy', ), prometheus.target( - 'hadoop_resourcemanager_numrebootednms{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="ClusterMetrics"}', + 'hadoop_resourcemanager_numrebootednms{' + matcher + ', name="ClusterMetrics"}', datasource=promDatasource, legendFormat='rebooted', ), prometheus.target( - 'hadoop_resourcemanager_numshutdownnms{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="ClusterMetrics"}', + 'hadoop_resourcemanager_numshutdownnms{' + matcher + ', name="ClusterMetrics"}', datasource=promDatasource, legendFormat='shutdown', ), @@ -103,36 +105,36 @@ local applicationsRow = { collapsed: false, }; -local applicationsStatePanel = { +local applicationsStatePanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_resourcemanager_appsrunning{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_appsrunning{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='running', ), prometheus.target( - 'hadoop_resourcemanager_appspending{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_appspending{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='pending', ), prometheus.target( - 'hadoop_resourcemanager_appskilled{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_appskilled{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='killed', ), prometheus.target( - 'hadoop_resourcemanager_appssubmitted{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_appssubmitted{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='submitted', ), prometheus.target( - 'hadoop_resourcemanager_appscompleted{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_appscompleted{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='completed', ), prometheus.target( - 'hadoop_resourcemanager_appsfailed{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_appsfailed{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='failed', ), @@ -206,17 +208,17 @@ local applicationsStatePanel = { }, }; -local availableMemoryPanel = { +local availableMemoryPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_resourcemanager_allocatedmb{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_allocatedmb{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='allocated', 
format='time_series', ), prometheus.target( - 'hadoop_resourcemanager_availablemb{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_availablemb{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='available', format='time_series', @@ -288,17 +290,17 @@ local availableMemoryPanel = { }, }; -local availableVirtualCoresPanel = { +local availableVirtualCoresPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_resourcemanager_availablevcores{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_availablevcores{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='available', format='time_series', ), prometheus.target( - 'hadoop_resourcemanager_allocatedvcores{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster", name="QueueMetrics",q0="root", q1="default"}', + 'hadoop_resourcemanager_allocatedvcores{' + matcher + ', name="QueueMetrics",q0="root", q1="default"}', datasource=promDatasource, legendFormat='allocated', format='time_series', @@ -384,17 +386,17 @@ local jvmRow = { collapsed: false, }; -local memoryUsedPanel = { +local memoryUsedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_resourcemanager_memheapusedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_resourcemanager_memheapusedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='heap', format='time_series', ), prometheus.target( - 'hadoop_resourcemanager_memnonheapusedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_resourcemanager_memnonheapusedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='nonheap', format='time_series', @@ -467,17 +469,17 @@ local memoryUsedPanel = { pluginVersion: '10.0.2-cloud.1.94a6f396', }; -local memoryCommittedPanel = { +local memoryCommittedPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'hadoop_resourcemanager_memheapcommittedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_resourcemanager_memheapcommittedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='heap', format='time_series', ), prometheus.target( - 'hadoop_resourcemanager_memnonheapcommittedm{name="JvmMetrics", job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}', + 'hadoop_resourcemanager_memnonheapcommittedm{name="JvmMetrics", ' + matcher + '}', datasource=promDatasource, legendFormat='nonheap', format='time_series', @@ -549,11 +551,11 @@ local memoryCommittedPanel = { }, }; -local garbageCollectionCountPanel = { +local garbageCollectionCountPanel(matcher) = { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_resourcemanager_gccount{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:])', + 'increase(hadoop_resourcemanager_gccount{' + matcher + '}[$__interval:])', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -626,11 +628,11 @@ local garbageCollectionCountPanel = { }, }; -local averageGarbageCollectionTimePanel = { +local averageGarbageCollectionTimePanel(matcher) 
= { datasource: promDatasource, targets: [ prometheus.target( - 'increase(hadoop_resourcemanager_gctimemillis{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:]) / clamp_min(increase(hadoop_resourcemanager_gccount{job=~"$job", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"}[$__interval:]), 1)', + 'increase(hadoop_resourcemanager_gctimemillis{' + matcher + '}[$__interval:]) / clamp_min(increase(hadoop_resourcemanager_gccount{' + matcher + '}[$__interval:]), 1)', datasource=promDatasource, legendFormat='{{hadoop_cluster}} - {{instance}}', format='time_series', @@ -703,13 +705,13 @@ local averageGarbageCollectionTimePanel = { }, }; -local resourcemanagerLogsPanel = { +local resourcemanagerLogsPanel(matcher) = { datasource: lokiDatasource, targets: [ { datasource: lokiDatasource, editorMode: 'code', - expr: '{job=~"$job", hadoop_cluster=~"$hadoop_cluster", instance=~"$instance", filename=~".*/hadoop/logs/.*-resourcemanager.*.log"} |= ``', + expr: '{' + matcher + '} |= `` | (filename=~".*/hadoop/logs/.*-resourcemanager.*.log" or log_type="resourcemanager")', queryType: 'range', refId: 'A', }, @@ -780,10 +782,22 @@ local resourcemanagerLogsPanel = { allValues='.+', sort=1 ), + template.new( + 'cluster', + promDatasource, + 'label_values(hadoop_resourcemanager_activeapplications{%(multiclusterSelector)s}, cluster)' % $._config, + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='.*', + hide=if $._config.enableMultiCluster then '' else 'variable', + sort=0 + ), template.new( 'instance', promDatasource, - 'label_values(hadoop_resourcemanager_activeapplications{job=~"$job"}, instance)', + 'label_values(hadoop_resourcemanager_activeapplications{%(hadoopSelector)s}, instance)' % $._config, label='Instance', refresh=2, includeAll=true, @@ -794,7 +808,7 @@ local resourcemanagerLogsPanel = { template.new( 'hadoop_cluster', promDatasource, - 'label_values(hadoop_resourcemanager_activeapplications{job=~"$job"}, hadoop_cluster)', + 'label_values(hadoop_resourcemanager_activeapplications{%(hadoopSelector)s}, hadoop_cluster)' % $._config, label='Hadoop cluster', refresh=2, includeAll=true, @@ -808,19 +822,19 @@ local resourcemanagerLogsPanel = { .addPanels( std.flattenArrays([ [ - nodeManagersStatePanel { gridPos: { h: 9, w: 24, x: 0, y: 0 } }, + nodeManagersStatePanel(getMatcher($._config)) { gridPos: { h: 9, w: 24, x: 0, y: 0 } }, applicationsRow { gridPos: { h: 1, w: 24, x: 0, y: 9 } }, - applicationsStatePanel { gridPos: { h: 8, w: 24, x: 0, y: 10 } }, - availableMemoryPanel { gridPos: { h: 6, w: 12, x: 0, y: 18 } }, - availableVirtualCoresPanel { gridPos: { h: 6, w: 12, x: 12, y: 18 } }, + applicationsStatePanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 10 } }, + availableMemoryPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 18 } }, + availableVirtualCoresPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 18 } }, jvmRow { gridPos: { h: 1, w: 24, x: 0, y: 24 } }, - memoryUsedPanel { gridPos: { h: 6, w: 12, x: 0, y: 25 } }, - memoryCommittedPanel { gridPos: { h: 6, w: 12, x: 12, y: 25 } }, - garbageCollectionCountPanel { gridPos: { h: 6, w: 12, x: 0, y: 31 } }, - averageGarbageCollectionTimePanel { gridPos: { h: 6, w: 12, x: 12, y: 31 } }, + memoryUsedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 0, y: 25 } }, + memoryCommittedPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 25 } }, + garbageCollectionCountPanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 
0, y: 31 } }, + averageGarbageCollectionTimePanel(getMatcher($._config)) { gridPos: { h: 6, w: 12, x: 12, y: 31 } }, ], if $._config.enableLokiLogs then [ - resourcemanagerLogsPanel { gridPos: { h: 8, w: 24, x: 0, y: 37 } }, + resourcemanagerLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 37 } }, ] else [], [ ],
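
Note (not part of the patch): a minimal standalone sketch of how the new config keys from apache-hadoop-mixin/config.libsonnet compose into the matcher string that every parameterized panel receives. The local `config` object below simply mirrors the values added in this change, `getMatcher` is the same helper each dashboard now defines, and the output field name `matcher` is illustrative only; evaluate it with plain `jsonnet` to see the expanded selector.

    // Sketch only: mirrors the new keys in config.libsonnet with enableMultiCluster flipped on.
    local config = {
      enableMultiCluster: true,
      multiclusterSelector: 'job=~"$job"',
      hadoopSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
    };

    // Same helper each dashboard defines locally in this patch.
    local getMatcher(cfg) = '%(hadoopSelector)s, instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"' % cfg;

    {
      // With enableMultiCluster: true this evaluates to
      //   'job=~"$job", cluster=~"$cluster", instance=~"$instance", hadoop_cluster=~"$hadoop_cluster"';
      // with enableMultiCluster: false the cluster=~"$cluster" matcher is dropped
      // and the hidden $cluster template variable stays out of the way.
      matcher: getMatcher(config),
    }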