Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: otel prometheus update #198

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion k8s/operator/manifests/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ rules:
resources: ["events"]
verbs: ["create"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources: ["clusterroles", "clusterrolebindings"]
resources: ["clusterroles", "clusterrolebindings", "roles", "rolebindings"]
verbs: ["create", "get", "patch"]
- apiGroups: ["keramik.3box.io"]
resources: ["networks", "networks/status", "simulations", "simulations/status"]
Expand Down
139 changes: 21 additions & 118 deletions operator/src/monitoring/opentelemetry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use k8s_openapi::{
PodSpec, PodTemplateSpec, ResourceRequirements, ServicePort, ServiceSpec, Volume,
VolumeMount,
},
rbac::v1::{ClusterRole, ClusterRoleBinding, PolicyRule, RoleRef, Subject},
rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject},
},
apimachinery::pkg::{
api::resource::Quantity,
Expand All @@ -27,16 +27,16 @@ use crate::{
resource_limits::ResourceLimitsConfig,
},
utils::{
apply_account, apply_cluster_role, apply_cluster_role_binding, apply_config_map,
apply_account, apply_config_map, apply_namespaced_role, apply_namespaced_role_binding,
apply_service, apply_stateful_set, Clock, Context,
},
};

pub const OTEL_APP: &str = "otel";
pub const OTEL_SERVICE_NAME: &str = "otel";

pub const OTEL_CR_BINDING: &str = "monitoring-cluster-role-binding";
pub const OTEL_CR: &str = "monitoring-cluster-role";
pub const OTEL_ROLE_BINDING: &str = "monitoring-role-binding";
pub const OTEL_ROLE: &str = "monitoring-role";
pub const OTEL_ACCOUNT: &str = "monitoring-service-account";

pub const OTEL_CONFIG_MAP_NAME: &str = "otel-config";
Expand All @@ -52,12 +52,13 @@ pub async fn apply(
orefs: &[OwnerReference],
) -> Result<(), kube::error::Error> {
apply_account(cx.clone(), ns, orefs.to_vec(), OTEL_ACCOUNT).await?;
apply_cluster_role(cx.clone(), ns, orefs.to_vec(), OTEL_CR, cluster_role()).await?;
apply_cluster_role_binding(
apply_namespaced_role(cx.clone(), ns, orefs.to_vec(), OTEL_ROLE, namespace_role()).await?;
apply_namespaced_role_binding(
cx.clone(),
ns,
orefs.to_vec(),
OTEL_CR_BINDING,
cluster_role_binding(ns),
OTEL_ROLE_BINDING,
role_binding(ns),
)
.await?;
apply_config_map(
Expand Down Expand Up @@ -172,7 +173,7 @@ fn stateful_set_spec(config: &OtelConfig) -> StatefulSetSpec {
}),
containers: vec![Container {
name: "opentelemetry".to_owned(),
image: Some("public.ecr.aws/r5b3e0r5/3box/otelcol".to_owned()),
image: Some("otel/opentelemetry-collector-contrib:0.104.0".to_owned()),
args: Some(vec!["--config=/config/otel-config.yaml".to_owned()]),
ports: Some(vec![
ContainerPort {
Expand Down Expand Up @@ -257,8 +258,13 @@ fn stateful_set_spec(config: &OtelConfig) -> StatefulSetSpec {
}
}

fn cluster_role() -> ClusterRole {
ClusterRole {
/// Build the data for the otel collector's ConfigMap.
///
/// The collector configuration lives in `otel-config.yaml` alongside this
/// source file and is embedded at compile time via `include_str!`, so the
/// operator binary always ships a config matching the code that deploys it.
///
/// Returns a single-entry map keyed by the file name the collector expects
/// (`otel-config.yaml`), suitable for mounting as a ConfigMap volume.
fn config_map_data() -> BTreeMap<String, String> {
    let config_str = include_str!("./otel-config.yaml");
    // `BTreeMap::from` on an array avoids the intermediate `Vec` that
    // `from_iter(vec![...])` would allocate for this single entry.
    BTreeMap::from([("otel-config.yaml".to_owned(), config_str.to_owned())])
}

fn namespace_role() -> Role {
Role {
rules: Some(vec![PolicyRule {
api_groups: Some(vec!["".to_owned()]),
resources: Some(vec!["pods".to_owned()]),
Expand All @@ -269,11 +275,11 @@ fn cluster_role() -> ClusterRole {
}
}

fn cluster_role_binding(ns: &str) -> ClusterRoleBinding {
ClusterRoleBinding {
fn role_binding(ns: &str) -> RoleBinding {
RoleBinding {
role_ref: RoleRef {
kind: "ClusterRole".to_owned(),
name: OTEL_CR.to_owned(),
kind: "Role".to_owned(),
name: OTEL_ROLE.to_owned(),
api_group: "rbac.authorization.k8s.io".to_owned(),
},
subjects: Some(vec![Subject {
Expand All @@ -285,106 +291,3 @@ fn cluster_role_binding(ns: &str) -> ClusterRoleBinding {
..Default::default()
}
}

/// Build the data for the otel collector's ConfigMap.
///
/// The entire collector configuration is embedded here as a raw-string YAML
/// literal keyed by the file name the collector expects (`otel-config.yaml`).
/// The config accepts push-based OTLP metrics/traces on :4317 and pull-scrapes
/// pods whose container port is named `metrics`; it exports traces to jaeger
/// and serves metrics on two prometheus endpoints (:9464 general, :9465
/// simulation-only — see the `service.pipelines` section of the YAML).
///
/// NOTE(review): the YAML literal's whitespace is part of the string and is
/// structurally significant to the collector — do not reflow it.
fn config_map_data() -> BTreeMap<String, String> {
// Include a config that will scrape pods in the network
BTreeMap::from_iter(vec![(
"otel-config.yaml".to_owned(),
r#"---
receivers:
# Push based metrics
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
# Pull based metrics
prometheus:
config:
scrape_configs:
- job_name: 'kubernetes-service-endpoints'
scrape_interval: 10s
scrape_timeout: 1s

kubernetes_sd_configs:
- role: pod

# Only container ports named `metrics` will be considered valid targets.
#
# Setup relabel rules to give meaning to the following k8s annotations:
# prometheus/path - URL path of the metrics endpoint
#
# Example:
# annotations:
# prometheus/path: "/api/v0/metrics"
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: "metrics"
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod
- source_labels: [__meta_kubernetes_pod_container_name]
action: replace
target_label: kubernetes_container

processors:
batch:

exporters:
# This is unused but can be easily added for debugging.
logging:
# can be one of detailed | normal | basic
verbosity: detailed
# Log all messages, do not sample
sampling_initial: 1
sampling_thereafter: 1
otlp/jaeger:
endpoint: jaeger:4317
tls:
insecure: true
prometheus:
endpoint: 0.0.0.0:9464
# Keep stale metrics around for 1h before dropping
# This helps as simulation metrics are stale once the simulation stops.
metric_expiration: 1h
resource_to_telemetry_conversion:
enabled: true
prometheus/simulation:
endpoint: 0.0.0.0:9465
# Keep stale metrics around for 1h before dropping
# This helps as simulation metrics are stale once the simulation stops.
metric_expiration: 1h
resource_to_telemetry_conversion:
enabled: true

service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [otlp/jaeger]
metrics:
receivers: [otlp,prometheus]
processors: [batch]
exporters: [prometheus]
metrics/simulation:
receivers: [otlp]
processors: [batch]
exporters: [prometheus/simulation]
# Enable telemetry on the collector itself
telemetry:
logs:
level: info
metrics:
level: detailed
address: 0.0.0.0:8888"#
.to_owned(),
)])
}
82 changes: 82 additions & 0 deletions operator/src/monitoring/otel-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
---
receivers:
# Push based metrics
otlp:
samika98 marked this conversation as resolved.
Show resolved Hide resolved
protocols:
grpc:
endpoint: 0.0.0.0:4317
# Pull based metrics
prometheus/scrape_configs:
config:
scrape_configs:
- job_name: 'kubernetes-service-endpoints'
scrape_interval: 10s
scrape_timeout: 1s

kubernetes_sd_configs:
- role: pod
namespaces:
own_namespace: true
# Only container ports named `metrics` will be considered valid targets.
#
# Setup relabel rules to give meaning to the following k8s annotations:
# prometheus/path - URL path of the metrics endpoint
#
# Example:
# annotations:
# prometheus/path: "/api/v0/metrics"
relabel_configs:
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: "metrics"
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod
- source_labels: [__meta_kubernetes_pod_container_name]
action: replace
target_label: kubernetes_container

processors:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we filter out duplicates while processing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can if necessary, see this issue about the situation.

open-telemetry/opentelemetry-collector-contrib#14900

batch:

exporters:
# This is unused but can be easily added for debugging.
logging:
# can be one of detailed | normal | basic
verbosity: detailed
# Log all messages, do not sample
sampling_initial: 1
sampling_thereafter: 1
prometheus/endpoint:
endpoint: 0.0.0.0:9464
# Keep stale metrics around for 1h before dropping
# This helps as simulation metrics are stale once the simulation stops.
metric_expiration: 1h
resource_to_telemetry_conversion:
enabled: true
# Remote write to prometheus
prometheusremotewrite:
endpoint: "http://prometheus:9090/api/v1/write"
tls:
insecure: true

service:
pipelines:
metrics:
receivers: [otlp,prometheus/scrape_configs]
processors: [batch]
exporters: [prometheus/endpoint,prometheusremotewrite]
# Enable telemetry on the collector itself
telemetry:
logs:
level: debug
metrics:
level: detailed
address: 0.0.0.0:8888
4 changes: 4 additions & 0 deletions operator/src/monitoring/prom-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
global:
scrape_interval: 10s
scrape_timeout: 5s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These seem a bit aggressive to me. Do we need near real-time metrics for our use cases? Can we scrape every 30 seconds to a minute?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just pulled these settings along from the existing config.
Want to drop them down to the defaults?

# How frequently to scrape targets by default.
  [ scrape_interval: [<duration>](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration) | default = 1m ]

  # How long until a scrape request times out.
  [ scrape_timeout: [<duration>](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration) | default = 10s ]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't have a reason to scrape aggressively then we can save resources by setting defaults


54 changes: 32 additions & 22 deletions operator/src/monitoring/prometheus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@ use k8s_openapi::{
apps::v1::StatefulSetSpec,
core::v1::{
ConfigMapVolumeSource, Container, ContainerPort, PodSpec, PodTemplateSpec,
ResourceRequirements, Volume, VolumeMount,
ResourceRequirements, ServicePort, ServiceSpec, Volume, VolumeMount,
},
},
apimachinery::pkg::{
api::resource::Quantity,
apis::meta::v1::ObjectMeta,
apis::meta::v1::{LabelSelector, OwnerReference},
apis::meta::v1::{LabelSelector, ObjectMeta, OwnerReference},
util::intstr::IntOrString,
},
};
use rand::RngCore;

use crate::{
network::{ipfs_rpc::IpfsRpcClient, resource_limits::ResourceLimitsConfig},
utils::{apply_config_map, apply_stateful_set, Clock, Context},
utils::{apply_config_map, apply_service, apply_stateful_set, Clock, Context},
};

use crate::labels::selector_labels;

pub const PROM_APP: &str = "prometheus";
pub const PROM_CONFIG_MAP_NAME: &str = "prom-config";
pub const PROM_SERVICE_NAME: &str = "prometheus";

pub struct PrometheusConfig {
pub dev_mode: bool,
Expand All @@ -44,6 +45,14 @@ pub async fn apply(
config_map_data(),
)
.await?;
apply_service(
cx.clone(),
ns,
orefs.to_vec(),
PROM_SERVICE_NAME,
service_spec(),
)
.await?;
apply_stateful_set(
cx.clone(),
ns,
Expand Down Expand Up @@ -79,6 +88,21 @@ fn resource_requirements(dev_mode: bool) -> ResourceRequirements {
}
}

/// Spec for a ClusterIP Service exposing prometheus inside the cluster.
///
/// Selects pods labeled with `PROM_APP` and forwards service port 9090
/// straight through to the same container port.
fn service_spec() -> ServiceSpec {
    // Single named port; service port and container port are both 9090.
    let prom_port = ServicePort {
        name: Some("prometheus".to_owned()),
        port: 9090,
        protocol: Some("TCP".to_owned()),
        target_port: Some(IntOrString::Int(9090)),
        ..Default::default()
    };
    ServiceSpec {
        ports: Some(vec![prom_port]),
        selector: selector_labels(PROM_APP),
        type_: Some("ClusterIP".to_owned()),
        ..Default::default()
    }
}

fn stateful_set_spec(dev_mode: bool) -> StatefulSetSpec {
StatefulSetSpec {
replicas: Some(1),
Expand All @@ -94,10 +118,11 @@ fn stateful_set_spec(dev_mode: bool) -> StatefulSetSpec {
spec: Some(PodSpec {
containers: vec![Container {
name: "prometheus".to_owned(),
image: Some("prom/prometheus:v2.42.0".to_owned()),
image: Some("prom/prometheus:v2.45.6".to_owned()),
command: Some(vec![
"/bin/prometheus".to_owned(),
"--web.enable-lifecycle".to_owned(),
"--web.enable-remote-write-receiver".to_owned(),
"--config.file=/config/prom-config.yaml".to_owned(),
]),
ports: Some(vec![ContainerPort {
Expand Down Expand Up @@ -132,22 +157,7 @@ fn stateful_set_spec(dev_mode: bool) -> StatefulSetSpec {
}

fn config_map_data() -> BTreeMap<String, String> {
BTreeMap::from_iter(vec![(
"prom-config.yaml".to_owned(),
r#"
global:
scrape_interval: 10s
scrape_timeout: 5s
let config_str = include_str!("./prom-config.yaml");

scrape_configs:
- job_name: services
metrics_path: /metrics
honor_labels: true
static_configs:
- targets:
- 'localhost:9090'
- 'otel:9464'
- 'otel:8888'"#
.to_owned(),
)])
BTreeMap::from_iter(vec![("prom-config.yaml".to_owned(), config_str.to_owned())])
}
Loading
Loading