Skip to content

Commit bbd0854

Browse files
xeniaperazvan
andauthored
chore: add built-in Prometheus support (#955)
* chore: replace jmx exporter with built in Prometheus support * add changelog entry * test for metrics being present * add monitoring documentation * metrics port on pods/service configurable * add jmx exporter back in * adjust changelog entry * Update docs/modules/zookeeper/pages/usage_guide/monitoring.adoc Co-authored-by: Razvan-Daniel Mihai <[email protected]> * adjust documentation --------- Co-authored-by: Razvan-Daniel Mihai <[email protected]>
1 parent 0bbc3a9 commit bbd0854

File tree

5 files changed

+122
-12
lines changed

5 files changed

+122
-12
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
1010
- Use `--file-log-max-files` (or `FILE_LOG_MAX_FILES`) to limit the number of log files kept.
1111
- Use `--file-log-rotation-period` (or `FILE_LOG_ROTATION_PERIOD`) to configure the frequency of rotation.
1212
- Use `--console-log-format` (or `CONSOLE_LOG_FORMAT`) to set the format to `plain` (default) or `json`.
13+
- Add built-in Prometheus support and expose metrics on the `/metrics` path of the `native-metrics` port ([#955]).
1314

1415
### Changed
1516

@@ -45,6 +46,7 @@ All notable changes to this project will be documented in this file.
4546
[#942]: https://github.com/stackabletech/zookeeper-operator/pull/942
4647
[#946]: https://github.com/stackabletech/zookeeper-operator/pull/946
4748
[#950]: https://github.com/stackabletech/zookeeper-operator/pull/950
49+
[#955]: https://github.com/stackabletech/zookeeper-operator/pull/955
4850

4951
## [25.3.0] - 2025-03-21
5052

docs/modules/zookeeper/pages/usage_guide/monitoring.adoc

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,23 @@
22
:description: The managed ZooKeeper instances are automatically configured to export Prometheus metrics.
33

44
The managed ZooKeeper instances are automatically configured to export Prometheus metrics.
5-
See xref:operators:monitoring.adoc[] for more details.
5+
See xref:operators:monitoring.adoc[window=_blank] for more details.
6+
7+
Depending on the SDP version, different ZooKeeper monitoring systems are used to produce metrics. Currently, JMX in combination with JMX Exporter
8+
is used, but it will be removed in a later release. Starting with SDP 25.7, the built-in Prometheus support of ZooKeeper is enabled as well.
9+
The naming of the metrics differs between the two systems.
10+
11+
== Metrics
12+
13+
Starting with SDP 25.7, ZooKeeper is configured to export metrics using the built-in Prometheus provider. More information on the Prometheus provider can be found in
14+
the https://zookeeper.apache.org/doc/current/zookeeperMonitor.html[ZooKeeper Monitor Guide,window=_blank].
15+
16+
The configuration is located in the `zoo.cfg`:
17+
18+
[source,properties]
19+
----
20+
metricsProvider.className=org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider
21+
metricsProvider.httpPort=7000
22+
----
23+
24+
The metrics can be accessed by calling the `/metrics` endpoint on the specified port.

rust/operator-binary/src/crd/mod.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ pub const ZOOKEEPER_PROPERTIES_FILE: &str = "zoo.cfg";
4848
pub const JVM_SECURITY_PROPERTIES_FILE: &str = "security.properties";
4949

5050
pub const METRICS_PORT: u16 = 9505;
51+
pub const METRICS_PROVIDER_HTTP_PORT_KEY: &str = "metricsProvider.httpPort";
52+
pub const METRICS_PROVIDER_HTTP_PORT: u16 = 7000;
5153

5254
pub const STACKABLE_DATA_DIR: &str = "/stackable/data";
5355
pub const STACKABLE_CONFIG_DIR: &str = "/stackable/config";
@@ -468,6 +470,16 @@ impl Configuration for v1alpha1::ZookeeperConfigFragment {
468470
v1alpha1::ZookeeperConfig::DATA_DIR.to_string(),
469471
Some(STACKABLE_DATA_DIR.to_string()),
470472
);
473+
result.insert(
474+
"metricsProvider.className".to_string(),
475+
Some(
476+
"org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider".to_string(),
477+
),
478+
);
479+
result.insert(
480+
METRICS_PROVIDER_HTTP_PORT_KEY.to_string(),
481+
Some(METRICS_PROVIDER_HTTP_PORT.to_string()),
482+
);
471483
}
472484

473485
Ok(result)

rust/operator-binary/src/zk_controller.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,9 @@ use crate::{
7272
config::jvm::{construct_non_heap_jvm_args, construct_zk_server_heap_env},
7373
crd::{
7474
DOCKER_IMAGE_BASE_NAME, JVM_SECURITY_PROPERTIES_FILE, MAX_PREPARE_LOG_FILE_SIZE,
75-
MAX_ZK_LOG_FILES_SIZE, STACKABLE_CONFIG_DIR, STACKABLE_DATA_DIR, STACKABLE_LOG_CONFIG_DIR,
76-
STACKABLE_LOG_DIR, STACKABLE_RW_CONFIG_DIR, ZOOKEEPER_PROPERTIES_FILE, ZookeeperRole,
75+
MAX_ZK_LOG_FILES_SIZE, METRICS_PROVIDER_HTTP_PORT, METRICS_PROVIDER_HTTP_PORT_KEY,
76+
STACKABLE_CONFIG_DIR, STACKABLE_DATA_DIR, STACKABLE_LOG_CONFIG_DIR, STACKABLE_LOG_DIR,
77+
STACKABLE_RW_CONFIG_DIR, ZOOKEEPER_PROPERTIES_FILE, ZookeeperRole,
7778
security::{self, ZookeeperSecurity},
7879
v1alpha1,
7980
},
@@ -415,6 +416,7 @@ pub async fn reconcile_zk(
415416
&rolegroup,
416417
&resolved_product_image,
417418
&zookeeper_security,
419+
rolegroup_config,
418420
)?;
419421
let rg_configmap = build_server_rolegroup_config_map(
420422
zk,
@@ -675,6 +677,7 @@ fn build_server_rolegroup_service(
675677
rolegroup: &RoleGroupRef<v1alpha1::ZookeeperCluster>,
676678
resolved_product_image: &ResolvedProductImage,
677679
zookeeper_security: &ZookeeperSecurity,
680+
rolegroup_config: &HashMap<PropertyNameKind, BTreeMap<String, String>>,
678681
) -> Result<Service> {
679682
let prometheus_label =
680683
Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?;
@@ -716,6 +719,12 @@ fn build_server_rolegroup_service(
716719
protocol: Some("TCP".to_string()),
717720
..ServicePort::default()
718721
},
722+
ServicePort {
723+
name: Some("native-metrics".to_string()),
724+
port: metrics_port_from_rolegroup_config(rolegroup_config).into(),
725+
protocol: Some("TCP".to_string()),
726+
..ServicePort::default()
727+
},
719728
]),
720729
selector: Some(service_selector_labels.into()),
721730
publish_not_ready_addresses: Some(true),
@@ -898,6 +907,10 @@ fn build_server_rolegroup_statefulset(
898907
.add_container_port("zk-leader", 2888)
899908
.add_container_port("zk-election", 3888)
900909
.add_container_port("metrics", 9505)
910+
.add_container_port(
911+
"native-metrics",
912+
metrics_port_from_rolegroup_config(server_config).into(),
913+
)
901914
.add_volume_mount("data", STACKABLE_DATA_DIR)
902915
.context(AddVolumeMountSnafu)?
903916
.add_volume_mount("config", STACKABLE_CONFIG_DIR)
@@ -1063,6 +1076,27 @@ fn build_server_rolegroup_statefulset(
10631076
})
10641077
}
10651078

1079+
fn metrics_port_from_rolegroup_config(
1080+
rolegroup_config: &HashMap<PropertyNameKind, BTreeMap<String, String>>,
1081+
) -> u16 {
1082+
let metrics_port = rolegroup_config
1083+
.get(&PropertyNameKind::File(
1084+
ZOOKEEPER_PROPERTIES_FILE.to_string(),
1085+
))
1086+
.expect("{ZOOKEEPER_PROPERTIES_FILE} is present")
1087+
.get(METRICS_PROVIDER_HTTP_PORT_KEY)
1088+
.expect("{METRICS_PROVIDER_HTTP_PORT_KEY} is set");
1089+
1090+
match u16::from_str(metrics_port) {
1091+
Ok(port) => port,
1092+
Err(err) => {
1093+
tracing::error!("{err}");
1094+
tracing::info!("Defaulting to using {METRICS_PROVIDER_HTTP_PORT} as metrics port.");
1095+
METRICS_PROVIDER_HTTP_PORT
1096+
}
1097+
}
1098+
}
1099+
10661100
pub fn error_policy(
10671101
_obj: Arc<DeserializeGuard<v1alpha1::ZookeeperCluster>>,
10681102
error: &Error,

tests/templates/kuttl/smoke/test_zookeeper.py

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import requests
44
import time
55
import sys
6+
67
sys.tracebacklimit = 0
78

89

@@ -37,17 +38,29 @@ def check_ruok(hosts):
3738
url = host + ":8080/commands/" + cmd_ruok
3839
response = try_get(url).json()
3940

40-
if "command" in response and response["command"] == cmd_ruok \
41-
and "error" in response and response["error"] is None:
41+
if (
42+
"command" in response
43+
and response["command"] == cmd_ruok
44+
and "error" in response
45+
and response["error"] is None
46+
):
4247
continue
4348
else:
44-
print("Error[" + cmd_ruok + "] for [" + url + "]: received " + str(
45-
response) + " - expected {'command': 'ruok', 'error': None} ")
49+
print(
50+
"Error["
51+
+ cmd_ruok
52+
+ "] for ["
53+
+ url
54+
+ "]: received "
55+
+ str(response)
56+
+ " - expected {'command': 'ruok', 'error': None} "
57+
)
4658
exit(-1)
4759

4860

4961
def check_monitoring(hosts):
5062
for host in hosts:
63+
# test for the jmx exporter metrics
5164
url = host + ":9505"
5265
response = try_get(url)
5366

@@ -57,16 +70,46 @@ def check_monitoring(hosts):
5770
print("Error for [" + url + "]: could not access monitoring")
5871
exit(-1)
5972

73+
# test for the native metrics
74+
url = host + ":7000/metrics"
75+
response = try_get(url)
76+
77+
if response.ok:
78+
# arbitrary metric was chosen to test if metrics are present in the response
79+
if "quorum_size" in response.text:
80+
continue
81+
else:
82+
print("Error for [" + url + "]: missing metrics")
83+
exit(-1)
84+
continue
85+
else:
86+
print("Error for [" + url + "]: could not access monitoring")
87+
exit(-1)
88+
6089

61-
if __name__ == '__main__':
90+
if __name__ == "__main__":
6291
all_args = argparse.ArgumentParser(description="Test ZooKeeper.")
63-
all_args.add_argument("-n", "--namespace", help="The namespace to run in", required=True)
92+
all_args.add_argument(
93+
"-n", "--namespace", help="The namespace to run in", required=True
94+
)
6495
args = vars(all_args.parse_args())
6596
namespace = args["namespace"]
6697

67-
host_primary_0 = "http://test-zk-server-primary-0.test-zk-server-primary." + namespace + ".svc.cluster.local"
68-
host_primary_1 = "http://test-zk-server-primary-1.test-zk-server-primary." + namespace + ".svc.cluster.local"
69-
host_secondary = "http://test-zk-server-secondary-0.test-zk-server-secondary." + namespace + ".svc.cluster.local"
98+
host_primary_0 = (
99+
"http://test-zk-server-primary-0.test-zk-server-primary."
100+
+ namespace
101+
+ ".svc.cluster.local"
102+
)
103+
host_primary_1 = (
104+
"http://test-zk-server-primary-1.test-zk-server-primary."
105+
+ namespace
106+
+ ".svc.cluster.local"
107+
)
108+
host_secondary = (
109+
"http://test-zk-server-secondary-0.test-zk-server-secondary."
110+
+ namespace
111+
+ ".svc.cluster.local"
112+
)
70113

71114
hosts = [host_primary_0, host_primary_1, host_secondary]
72115

0 commit comments

Comments
 (0)