Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: otel prometheus update #198

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion k8s/operator/manifests/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ rules:
resources: ["events"]
verbs: ["create"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources: ["clusterroles", "clusterrolebindings"]
resources: ["clusterroles", "clusterrolebindings", "roles", "rolebindings"]
verbs: ["create", "get", "patch"]
- apiGroups: ["keramik.3box.io"]
resources: ["networks", "networks/status", "simulations", "simulations/status"]
Expand Down
139 changes: 21 additions & 118 deletions operator/src/monitoring/opentelemetry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use k8s_openapi::{
PodSpec, PodTemplateSpec, ResourceRequirements, ServicePort, ServiceSpec, Volume,
VolumeMount,
},
rbac::v1::{ClusterRole, ClusterRoleBinding, PolicyRule, RoleRef, Subject},
rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject},
},
apimachinery::pkg::{
api::resource::Quantity,
Expand All @@ -27,16 +27,16 @@ use crate::{
resource_limits::ResourceLimitsConfig,
},
utils::{
apply_account, apply_cluster_role, apply_cluster_role_binding, apply_config_map,
apply_account, apply_config_map, apply_namespaced_role, apply_namespaced_role_binding,
apply_service, apply_stateful_set, Clock, Context,
},
};

pub const OTEL_APP: &str = "otel";
pub const OTEL_SERVICE_NAME: &str = "otel";

pub const OTEL_CR_BINDING: &str = "monitoring-cluster-role-binding";
pub const OTEL_CR: &str = "monitoring-cluster-role";
pub const OTEL_ROLE_BINDING: &str = "monitoring-role-binding";
pub const OTEL_ROLE: &str = "monitoring-role";
pub const OTEL_ACCOUNT: &str = "monitoring-service-account";

pub const OTEL_CONFIG_MAP_NAME: &str = "otel-config";
Expand All @@ -52,12 +52,13 @@ pub async fn apply(
orefs: &[OwnerReference],
) -> Result<(), kube::error::Error> {
apply_account(cx.clone(), ns, orefs.to_vec(), OTEL_ACCOUNT).await?;
apply_cluster_role(cx.clone(), ns, orefs.to_vec(), OTEL_CR, cluster_role()).await?;
apply_cluster_role_binding(
apply_namespaced_role(cx.clone(), ns, orefs.to_vec(), OTEL_ROLE, namespace_role()).await?;
apply_namespaced_role_binding(
cx.clone(),
ns,
orefs.to_vec(),
OTEL_CR_BINDING,
cluster_role_binding(ns),
OTEL_ROLE_BINDING,
role_binding(ns),
)
.await?;
apply_config_map(
Expand Down Expand Up @@ -172,7 +173,7 @@ fn stateful_set_spec(config: &OtelConfig) -> StatefulSetSpec {
}),
containers: vec![Container {
name: "opentelemetry".to_owned(),
image: Some("public.ecr.aws/r5b3e0r5/3box/otelcol".to_owned()),
image: Some("otel/opentelemetry-collector-contrib:0.104.0".to_owned()),
args: Some(vec!["--config=/config/otel-config.yaml".to_owned()]),
ports: Some(vec![
ContainerPort {
Expand Down Expand Up @@ -257,8 +258,13 @@ fn stateful_set_spec(config: &OtelConfig) -> StatefulSetSpec {
}
}

fn cluster_role() -> ClusterRole {
ClusterRole {
/// Build the data for the otel collector's ConfigMap.
///
/// The collector configuration lives in `otel-config.yaml` alongside this
/// source file and is embedded at compile time via `include_str!`, so the
/// operator binary always ships a config matching the code that deploys it.
///
/// Returns a single-entry map keyed by the file name the collector expects
/// (`otel-config.yaml`), suitable for mounting as a ConfigMap volume.
fn config_map_data() -> BTreeMap<String, String> {
    let config_str = include_str!("./otel-config.yaml");
    // `BTreeMap::from` on an array avoids the intermediate `Vec` that
    // `from_iter(vec![...])` would allocate for this single entry.
    BTreeMap::from([("otel-config.yaml".to_owned(), config_str.to_owned())])
}

fn namespace_role() -> Role {
Role {
rules: Some(vec![PolicyRule {
api_groups: Some(vec!["".to_owned()]),
resources: Some(vec!["pods".to_owned()]),
Expand All @@ -269,11 +275,11 @@ fn cluster_role() -> ClusterRole {
}
}

fn cluster_role_binding(ns: &str) -> ClusterRoleBinding {
ClusterRoleBinding {
fn role_binding(ns: &str) -> RoleBinding {
RoleBinding {
role_ref: RoleRef {
kind: "ClusterRole".to_owned(),
name: OTEL_CR.to_owned(),
kind: "Role".to_owned(),
name: OTEL_ROLE.to_owned(),
api_group: "rbac.authorization.k8s.io".to_owned(),
},
subjects: Some(vec![Subject {
Expand All @@ -285,106 +291,3 @@ fn cluster_role_binding(ns: &str) -> ClusterRoleBinding {
..Default::default()
}
}

/// Build the data for the otel collector's ConfigMap.
///
/// The entire collector configuration is embedded here as a raw-string YAML
/// literal keyed by the file name the collector expects (`otel-config.yaml`).
/// The config accepts push-based OTLP metrics/traces on :4317 and pull-scrapes
/// pods whose container port is named `metrics`; it exports traces to jaeger
/// and serves metrics on two prometheus endpoints (:9464 general, :9465
/// simulation-only — see the `service.pipelines` section of the YAML).
///
/// NOTE(review): the YAML literal's whitespace is part of the string and is
/// structurally significant to the collector — do not reflow it.
fn config_map_data() -> BTreeMap<String, String> {
// Include a config that will scrape pods in the network
BTreeMap::from_iter(vec![(
"otel-config.yaml".to_owned(),
r#"---
receivers:
# Push based metrics
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
# Pull based metrics
prometheus:
config:
scrape_configs:
- job_name: 'kubernetes-service-endpoints'
scrape_interval: 10s
scrape_timeout: 1s

kubernetes_sd_configs:
- role: pod

# Only container ports named `metrics` will be considered valid targets.
#
# Setup relabel rules to give meaning to the following k8s annotations:
# prometheus/path - URL path of the metrics endpoint
#
# Example:
# annotations:
# prometheus/path: "/api/v0/metrics"
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: "metrics"
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod
- source_labels: [__meta_kubernetes_pod_container_name]
action: replace
target_label: kubernetes_container

processors:
batch:

exporters:
# This is unused but can be easily added for debugging.
logging:
# can be one of detailed | normal | basic
verbosity: detailed
# Log all messages, do not sample
sampling_initial: 1
sampling_thereafter: 1
otlp/jaeger:
endpoint: jaeger:4317
tls:
insecure: true
prometheus:
endpoint: 0.0.0.0:9464
# Keep stale metrics around for 1h before dropping
# This helps as simulation metrics are stale once the simulation stops.
metric_expiration: 1h
resource_to_telemetry_conversion:
enabled: true
prometheus/simulation:
endpoint: 0.0.0.0:9465
# Keep stale metrics around for 1h before dropping
# This helps as simulation metrics are stale once the simulation stops.
metric_expiration: 1h
resource_to_telemetry_conversion:
enabled: true

service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [otlp/jaeger]
metrics:
receivers: [otlp,prometheus]
processors: [batch]
exporters: [prometheus]
metrics/simulation:
receivers: [otlp]
processors: [batch]
exporters: [prometheus/simulation]
# Enable telemetry on the collector itself
telemetry:
logs:
level: info
metrics:
level: detailed
address: 0.0.0.0:8888"#
.to_owned(),
)])
}
82 changes: 82 additions & 0 deletions operator/src/monitoring/otel-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
---
receivers:
# Push based metrics
otlp:
samika98 marked this conversation as resolved.
Show resolved Hide resolved
protocols:
grpc:
endpoint: 0.0.0.0:4317
# Pull based metrics
prometheus/scrape_configs:
config:
scrape_configs:
- job_name: 'kubernetes-service-endpoints'
scrape_interval: 10s
scrape_timeout: 1s

kubernetes_sd_configs:
- role: pod
namespaces:
own_namespace: true
# Only container ports named `metrics` will be considered valid targets.
#
# Setup relabel rules to give meaning to the following k8s annotations:
# prometheus/path - URL path of the metrics endpoint
#
# Example:
# annotations:
# prometheus/path: "/api/v0/metrics"
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: "metrics"
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod
- source_labels: [__meta_kubernetes_pod_container_name]
action: replace
target_label: kubernetes_container

processors:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we filter out duplicates while processing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can if necessary, see this issue about the situation.

open-telemetry/opentelemetry-collector-contrib#14900

batch:

exporters:
# This is unused but can be easily added for debugging.
logging:
# can be one of detailed | normal | basic
verbosity: detailed
# Log all messages, do not sample
sampling_initial: 1
sampling_thereafter: 1
prometheus/endpoint:
endpoint: 0.0.0.0:9464
# Keep stale metrics around for 1h before dropping
# This helps as simulation metrics are stale once the simulation stops.
metric_expiration: 1h
resource_to_telemetry_conversion:
enabled: true
# Remote write to prometheus
prometheusremotewrite:
endpoint: "http://prometheus:9090/api/v1/write"
tls:
insecure: true

service:
pipelines:
metrics:
receivers: [otlp,prometheus/scrape_configs]
processors: [batch]
exporters: [prometheus/endpoint,prometheusremotewrite]
# Enable telemetry on the collector itself
telemetry:
logs:
level: debug
metrics:
level: detailed
address: 0.0.0.0:8888
4 changes: 4 additions & 0 deletions operator/src/monitoring/prom-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
global:
scrape_interval: 10s
scrape_timeout: 5s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These seem a bit aggressive to me. Do we need near real-time metrics for our use cases? Can we scrape every 30 seconds to a minute?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just pulled these settings along from the existing config.
Want to drop them down to the defaults?

# How frequently to scrape targets by default.
  [ scrape_interval: [<duration>](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration) | default = 1m ]

  # How long until a scrape request times out.
  [ scrape_timeout: [<duration>](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration) | default = 10s ]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't have a reason to scrape aggressively then we can save resources by setting defaults


54 changes: 32 additions & 22 deletions operator/src/monitoring/prometheus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@ use k8s_openapi::{
apps::v1::StatefulSetSpec,
core::v1::{
ConfigMapVolumeSource, Container, ContainerPort, PodSpec, PodTemplateSpec,
ResourceRequirements, Volume, VolumeMount,
ResourceRequirements, ServicePort, ServiceSpec, Volume, VolumeMount,
},
},
apimachinery::pkg::{
api::resource::Quantity,
apis::meta::v1::ObjectMeta,
apis::meta::v1::{LabelSelector, OwnerReference},
apis::meta::v1::{LabelSelector, ObjectMeta, OwnerReference},
util::intstr::IntOrString,
},
};
use rand::RngCore;

use crate::{
network::{ipfs_rpc::IpfsRpcClient, resource_limits::ResourceLimitsConfig},
utils::{apply_config_map, apply_stateful_set, Clock, Context},
utils::{apply_config_map, apply_service, apply_stateful_set, Clock, Context},
};

use crate::labels::selector_labels;

pub const PROM_APP: &str = "prometheus";
pub const PROM_CONFIG_MAP_NAME: &str = "prom-config";
pub const PROM_SERVICE_NAME: &str = "prometheus";

pub struct PrometheusConfig {
pub dev_mode: bool,
Expand All @@ -44,6 +45,14 @@ pub async fn apply(
config_map_data(),
)
.await?;
apply_service(
cx.clone(),
ns,
orefs.to_vec(),
PROM_SERVICE_NAME,
service_spec(),
)
.await?;
apply_stateful_set(
cx.clone(),
ns,
Expand Down Expand Up @@ -79,6 +88,21 @@ fn resource_requirements(dev_mode: bool) -> ResourceRequirements {
}
}

/// Spec for a ClusterIP Service exposing prometheus inside the cluster.
///
/// Selects pods labeled with `PROM_APP` and forwards service port 9090
/// straight through to the same container port.
fn service_spec() -> ServiceSpec {
    // Single named port; service port and container port are both 9090.
    let prom_port = ServicePort {
        name: Some("prometheus".to_owned()),
        port: 9090,
        protocol: Some("TCP".to_owned()),
        target_port: Some(IntOrString::Int(9090)),
        ..Default::default()
    };
    ServiceSpec {
        ports: Some(vec![prom_port]),
        selector: selector_labels(PROM_APP),
        type_: Some("ClusterIP".to_owned()),
        ..Default::default()
    }
}

fn stateful_set_spec(dev_mode: bool) -> StatefulSetSpec {
StatefulSetSpec {
replicas: Some(1),
Expand All @@ -94,10 +118,11 @@ fn stateful_set_spec(dev_mode: bool) -> StatefulSetSpec {
spec: Some(PodSpec {
containers: vec![Container {
name: "prometheus".to_owned(),
image: Some("prom/prometheus:v2.42.0".to_owned()),
image: Some("prom/prometheus:v2.45.6".to_owned()),
command: Some(vec![
"/bin/prometheus".to_owned(),
"--web.enable-lifecycle".to_owned(),
"--web.enable-remote-write-receiver".to_owned(),
"--config.file=/config/prom-config.yaml".to_owned(),
]),
ports: Some(vec![ContainerPort {
Expand Down Expand Up @@ -132,22 +157,7 @@ fn stateful_set_spec(dev_mode: bool) -> StatefulSetSpec {
}

fn config_map_data() -> BTreeMap<String, String> {
BTreeMap::from_iter(vec![(
"prom-config.yaml".to_owned(),
r#"
global:
scrape_interval: 10s
scrape_timeout: 5s
let config_str = include_str!("./prom-config.yaml");

scrape_configs:
- job_name: services
metrics_path: /metrics
honor_labels: true
static_configs:
- targets:
- 'localhost:9090'
- 'otel:9464'
- 'otel:8888'"#
.to_owned(),
)])
BTreeMap::from_iter(vec![("prom-config.yaml".to_owned(), config_str.to_owned())])
}
Loading
Loading