Skip to content

Commit

Permalink
Merge pull request #66 from ai-cfia/65-custom-dockerfile-for-instrumentation-stack
Browse files Browse the repository at this point in the history

Issue #65: push new dockerfiles
  • Loading branch information
SonOfLope authored Nov 1, 2024
2 parents 7d3a5e6 + da3ee07 commit 5b03300
Show file tree
Hide file tree
Showing 10 changed files with 279 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/custom-dockerfile-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ on:
- otel-auto
- webtop-fedora-kde
- webtop-ubuntu-kde
- alloy
- tempo
- loki
- prometheus
tag:
required: true
description: Version to tag the image
Expand Down
6 changes: 6 additions & 0 deletions dockerfiles/alloy/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Grafana Alloy image with the pipeline config and endpoint credentials baked in.
# NOTE(review): "latest" is unpinned -- consider pinning a version tag for
# reproducible builds.
FROM grafana/alloy:latest

COPY config.alloy /etc/alloy/config.alloy
COPY endpoints.json /etc/alloy/endpoints.json

# The base image's entrypoint is the alloy binary; these arguments expose the
# Alloy UI/API on 0.0.0.0:12345 and allow public-preview stability components.
CMD ["run", "--server.http.listen-addr=0.0.0.0:12345", "--stability.level=public-preview", "/etc/alloy/config.alloy"]
87 changes: 87 additions & 0 deletions dockerfiles/alloy/config.alloy
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// Load endpoint credentials and options.
// NOTE(review): no component below references local.file.endpoints.content --
// verify this file is still needed, or wire it into the exporters.
local.file "endpoints" {
  filename = "/etc/alloy/endpoints.json"
}

// Scrape Prometheus-format metrics from the observability stack itself.
prometheus.scrape "infrastructure" {
  targets = [
    {"__address__" = "prometheus:9090", group = "infrastructure", service = "prometheus"},
    {"__address__" = "tempo:3200", group = "infrastructure", service = "tempo"},
    {"__address__" = "loki:3100", group = "infrastructure", service = "loki"},
    {"__address__" = "grafana:3000", group = "infrastructure", service = "grafana"},
  ]
  scrape_interval = "15s"
  forward_to      = [prometheus.remote_write.default.receiver]
}

// OTLP receiver for application telemetry (gRPC and HTTP on default ports).
// FIX: all signals now flow through the memory limiter and batch processor
// before reaching the exporters; previously the receiver forwarded straight
// to the exporters, leaving both processors defined but unused.
otelcol.receiver.otlp "default" {
  grpc { }
  http { }

  output {
    metrics = [otelcol.processor.memory_limiter.default.input]
    logs    = [otelcol.processor.memory_limiter.default.input]
    traces  = [otelcol.processor.memory_limiter.default.input]
  }
}

// Apply back-pressure before the collector pipeline exhausts memory.
otelcol.processor.memory_limiter "default" {
  check_interval = "1s"
  limit          = "1GiB"
  output {
    metrics = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
  }
}

// Batch telemetry to reduce the number of outgoing export requests.
otelcol.processor.batch "default" {
  output {
    metrics = [otelcol.exporter.prometheus.default.input]
    logs    = [otelcol.exporter.loki.default.input]
    traces  = [otelcol.exporter.otlp.tempo.input]
  }
}

// Alloy's own process logging.
logging {
  level  = "info"
  format = "logfmt"
}

// Convert OTLP logs into Loki log entries.
otelcol.exporter.loki "default" {
  forward_to = [loki.write.default.receiver]
}

// Push logs to the local Loki instance.
loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

// Export traces to Tempo over OTLP/gRPC.
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "http://tempo:4317"
    tls {
      insecure = true
    }
  }
}

// Convert OTLP metrics into Prometheus remote-write samples.
otelcol.exporter.prometheus "default" {
  forward_to = [prometheus.remote_write.default.receiver]
}

// Remote-write all metrics to the local Prometheus instance
// (requires Prometheus to run with --web.enable-remote-write-receiver).
prometheus.remote_write "default" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}
24 changes: 24 additions & 0 deletions dockerfiles/alloy/endpoints.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"metrics": {
"url": "http://prometheus:9009/api/v1/push",
"basicAuth": {
"username": "",
"password": ""
}
},
"logs": {
"url": "http://loki:3100/loki/api/v1/push",
"basicAuth": {
"username": "",
"password": ""
}
},
"traces": {
"url": "http://tempo:4317",
"basicAuthToken": "",
"tls": {
"insecure": true,
"insecureSkipVerify": true
}
}
}
5 changes: 5 additions & 0 deletions dockerfiles/loki/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Grafana Loki image with a custom single-binary configuration baked in.
# NOTE(review): "latest" is unpinned -- consider pinning a version tag for
# reproducible builds.
FROM grafana/loki:latest

COPY loki.yaml /etc/loki/loki.yaml

# The base image's entrypoint is the loki binary; enable the pattern ingester
# and point it at the bundled config file.
CMD ["--pattern-ingester.enabled=true", "-config.file=/etc/loki/loki.yaml"]
36 changes: 36 additions & 0 deletions dockerfiles/loki/loki.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Loki single-binary configuration with filesystem storage.
# Suited to local/demo deployments, not production.

# Disable multi-tenancy: no X-Scope-OrgID header is required and all data
# shares a single tenant.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  # Base directory for all Loki state.
  path_prefix: /data/loki
  # Single instance -- no replication.
  replication_factor: 1
  ring:
    kvstore:
      # In-memory ring; only valid when running exactly one instance.
      store: inmemory

ingester:
  wal:
    # NOTE(review): with the WAL disabled, log data not yet flushed to storage
    # is lost if the container restarts -- confirm this is acceptable.
    enabled: false

schema_config:
  configs:
    - from: "2023-01-05"
      index:
        period: 24h
        prefix: index_
      object_store: filesystem
      schema: v13
      store: tsdb

storage_config:
  filesystem:
    directory: /data/loki/chunks
  tsdb_shipper:
    active_index_directory: /data/loki/tsdb-index
    cache_location: /data/loki/tsdb-cache

limits_config:
  # Reject samples older than 7 days (168h).
  reject_old_samples: true
  reject_old_samples_max_age: 168h
5 changes: 5 additions & 0 deletions dockerfiles/prometheus/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Prometheus image with a custom scrape configuration baked in.
# NOTE(review): "latest" is unpinned -- consider pinning a version tag for
# reproducible builds.
FROM prom/prometheus:latest

# The repo ships the config as .yaml; the image convention expects .yml.
COPY prometheus.yaml /etc/prometheus/prometheus.yml

# Overriding CMD replaces ALL of the base image's default flags, so restore
# --storage.tsdb.path explicitly to keep TSDB data at the root of the
# declared /prometheus volume (matching the image default) instead of a
# relative "data/" directory.
# --web.enable-remote-write-receiver accepts remote-write pushes on
# /api/v1/write (used by the Alloy pipeline).
CMD ["--config.file=/etc/prometheus/prometheus.yml", \
     "--storage.tsdb.path=/prometheus", \
     "--web.enable-remote-write-receiver"]
13 changes: 13 additions & 0 deletions dockerfiles/prometheus/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Prometheus scrape configuration baked into the custom image.

global:
  scrape_interval: 15s     # how often to scrape each target
  evaluation_interval: 15s # how often to evaluate rules

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Application metrics.
  # NOTE(review): assumes a "backend" service exposing /metrics on port 5000
  # -- confirm against the compose/deployment definition.
  - job_name: 'backend'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['backend:5000']
5 changes: 5 additions & 0 deletions dockerfiles/tempo/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Grafana Tempo image with a custom configuration baked in.
# NOTE(review): "latest" is unpinned -- consider pinning a version tag for
# reproducible builds.
FROM grafana/tempo:latest

COPY tempo.yaml /etc/tempo.yaml

# The base image's entrypoint is the tempo binary; point it at the bundled
# config file.
CMD ["-config.file=/etc/tempo.yaml"]
94 changes: 94 additions & 0 deletions dockerfiles/tempo/tempo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# For more information on this configuration, see the complete reference guide at
# https://grafana.com/docs/tempo/latest/configuration/

# Enables result streaming from Tempo (to Grafana) via HTTP.
stream_over_http_enabled: true

# Configure the server block.
server:
  # Listen for all incoming requests on port 3200.
  http_listen_port: 3200

# The distributor receives incoming trace span data for the system.
distributor:
  receivers:             # This configuration will listen on all ports and protocols that tempo is capable of.
    jaeger:              # The receivers all come from the OpenTelemetry collector. More configuration information can
      protocols:         # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
        thrift_http:     #
        grpc:            # For a production deployment you should only enable the receivers you need!
        thrift_binary:   #
        thrift_compact:
    otlp:
      protocols:
        http:
        grpc:            # This example repository only utilises the OTLP gRPC receiver on port 4317.
    zipkin:              # Receive trace data in any supported Zipkin format.

# The ingester receives data from the distributor and processes it into indices and blocks.
ingester:
  trace_idle_period: 10s       # The length of time after a trace has not received spans to consider it complete and flush it.
  max_block_bytes: 1_000_000   # Cut the head block when it hits this size or
  max_block_duration: 5m       # this much time passes.

# The compactor block configures the compactor responsible for compacting TSDB blocks.
compactor:
  compaction:
    compaction_window: 1h              # Blocks in this time window will be compacted together.
    max_block_bytes: 100_000_000       # Maximum size of a compacted block.
    block_retention: 1h                # How long to keep blocks. Default is 14 days; this demo system is short-lived.
    compacted_block_retention: 10m     # How long to keep compacted blocks stored elsewhere.

# Configuration block to determine where to store TSDB blocks.
storage:
  trace:
    backend: local                         # Use the local filesystem for block storage. Not recommended for production systems.
    block:
      bloom_filter_false_positive: .05     # Bloom filter false positive rate. Lower values create larger filters but fewer false positives.
    # Write Ahead Log (WAL) configuration.
    wal:
      path: /tmp/tempo/wal                 # Directory to store the WAL locally.
    # Local configuration for filesystem storage.
    local:
      path: /tmp/tempo/blocks              # Directory to store the TSDB blocks.
    # Pool used for finding trace IDs.
    pool:
      max_workers: 100                     # Worker pool determines the number of parallel requests to the object store backend.
      queue_depth: 10000                   # Maximum depth for the querier queue jobs. A job is required for each block searched.

# Configures the metrics generator component of Tempo.
metrics_generator:
  # Specifies which processors to use.
  processor:
    # Span metrics create metrics based on span type, duration, name and service.
    span_metrics:
      # Configure extra dimensions to add as metric labels.
      dimensions:
        - http.method
        - http.target
        - http.status_code
        - service.version
    # Service graph metrics create node and edge metrics for determining service interactions.
    service_graphs:
      # Configure extra dimensions to add as metric labels.
      dimensions:
        - http.method
        - http.target
        - http.status_code
        - service.version
    # Configure the local blocks processor.
    local_blocks:
      # Ensure that metrics blocks are flushed to storage so TraceQL metrics
      # queries can run against historical data.
      flush_to_storage: true
  # The registry configuration determines how to process metrics.
  registry:
    collection_interval: 5s
    external_labels:
      source: tempo
  storage:
    path: /tmp/tempo/generator/wal
  traces_storage:
    path: /tmp/tempo/generator/traces

# Global override configuration.
overrides:
  metrics_generator_processors: ['service-graphs', 'span-metrics','local-blocks'] # The types of metrics generation to enable for each tenant.

0 comments on commit 5b03300

Please sign in to comment.