diff --git a/.gitignore b/.gitignore index 2b9cef78..f8be6f3f 100644 --- a/.gitignore +++ b/.gitignore @@ -129,7 +129,7 @@ docs/_build /services/monitoring/pgsql_query_exporter_config.yaml /services/monitoring/docker-compose.yml /services/monitoring/smokeping_prober_config.yaml - +services/monitoring/tempo_config.yaml # Simcore: Contains location of repo.config file on the machine and of the whole config directory .config.location diff --git a/Makefile b/Makefile index 13fd07bc..2e80bf48 100644 --- a/Makefile +++ b/Makefile @@ -71,7 +71,6 @@ down-maintenance: ## Stop the maintenance mode fi \ ,) - # Misc: info & clean .PHONY: info info-vars info-local info: ## Displays some important info diff --git a/services/jaeger/opentelemetry-collector-config.yaml b/services/jaeger/opentelemetry-collector-config.yaml index 112fea27..e8398d7e 100644 --- a/services/jaeger/opentelemetry-collector-config.yaml +++ b/services/jaeger/opentelemetry-collector-config.yaml @@ -8,11 +8,15 @@ receivers: exporters: otlphttp: endpoint: ${TRACING_OPENTELEMETRY_COLLECTOR_EXPORTER_ENDPOINT} # Adjust to your Jaeger endpoint + otlp: + endpoint: http://tempo:4317 + tls: + insecure: true service: pipelines: traces: receivers: [otlp] - exporters: [otlphttp] + exporters: [otlphttp,otlp] processors: [batch,probabilistic_sampler,filter/drop_healthcheck] telemetry: logs: diff --git a/services/monitoring/Makefile b/services/monitoring/Makefile index 74349eac..de49798f 100644 --- a/services/monitoring/Makefile +++ b/services/monitoring/Makefile @@ -9,6 +9,16 @@ REPO_BASE_DIR := $(abspath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))../..) # TARGETS -------------------------------------------------- include ${REPO_BASE_DIR}/scripts/common.Makefile +define create-s3-bucket + # ensure bucket is available in S3... + @set -o allexport; \ + source .env; \ + echo Creating bucket "$${TEMPO_S3_BUCKET}";\ + ${REPO_BASE_DIR}/scripts/create-s3-bucket.bash "$${TEMPO_S3_BUCKET}" && \ + set +o allexport; \ + # bucket is available in S3 +endef + .PHONY: up up: .init .env config.prometheus ${TEMP_COMPOSE} ## Deploys or updates current stack "$(STACK_NAME)". If MONITORED_NETWORK is not specified, it will create an attachable network @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE} $(STACK_NAME) @@ -16,6 +26,7 @@ up: .init .env config.prometheus ${TEMP_COMPOSE} ## Deploys or updates current s .PHONY: up-local up-local: .init .env config.prometheus.simcore ${TEMP_COMPOSE}-local ## Deploys or updates current stack "$(STACK_NAME)". If MONITORED_NETWORK is not specified, it will create an attachable network + @$(create-s3-bucket) @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-local $(STACK_NAME) $(MAKE) grafana-import @@ -49,28 +60,28 @@ up-master: .init .env config.monitoring config.prometheus.ceph.simcore ${TEMP_C @docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-master ${STACK_NAME} $(MAKE) grafana-import -${TEMP_COMPOSE}: docker-compose.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}: docker-compose.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< > $@ -${TEMP_COMPOSE}-letsencrypt-http: docker-compose.yml docker-compose.letsencrypt.http.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-letsencrypt-http: docker-compose.yml docker-compose.letsencrypt.http.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.letsencrypt.http.yml > $@ -${TEMP_COMPOSE}-letsencrypt-dns: docker-compose.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-letsencrypt-dns: docker-compose.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.letsencrypt.dns.yml > $@ -${TEMP_COMPOSE}-dalco: docker-compose.yml docker-compose.dalco.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-dalco: docker-compose.yml docker-compose.dalco.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.dalco.yml > $@ -${TEMP_COMPOSE}-public: docker-compose.yml docker-compose.public.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-public: docker-compose.yml docker-compose.public.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.public.yml > $@ -${TEMP_COMPOSE}-aws: docker-compose.yml docker-compose.aws.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-aws: docker-compose.yml docker-compose.aws.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.aws.yml > $@ -${TEMP_COMPOSE}-master: docker-compose.yml docker-compose.master.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-master: docker-compose.yml docker-compose.master.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.master.yml > $@ -${TEMP_COMPOSE}-local: docker-compose.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml +${TEMP_COMPOSE}-local: docker-compose.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml tempo_config.yaml @${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< docker-compose.letsencrypt.dns.yml > $@ docker-compose.yml: docker-compose.yml.j2 .env .venv pgsql_query_exporter_config.yaml @@ -137,6 +148,9 @@ pgsql_query_exporter_config.yaml: pgsql_query_exporter_config.yaml.j2 ${REPO_CON smokeping_prober_config.yaml: smokeping_prober_config.yaml.j2 ${REPO_CONFIG_LOCATION} .env .venv $(call jinja, $<, .env, $@); +tempo_config.yaml: tempo_config.yaml.j2 ${REPO_CONFIG_LOCATION} .env .venv + $(call jinja, $<, .env, $@); + .PHONY: grafana/assets grafana/assets: ${REPO_CONFIG_LOCATION} @$(MAKE_C) grafana assets diff --git a/services/monitoring/docker-compose.yml.j2 b/services/monitoring/docker-compose.yml.j2 index a19739ae..973babef 100644 --- a/services/monitoring/docker-compose.yml.j2 +++ b/services/monitoring/docker-compose.yml.j2 @@ -17,6 +17,8 @@ networks: configs: alertmanager_config: file: ./alertmanager/config.yml + tempo_config: + file: ./tempo_config.yaml node_exporter_entrypoint: file: ./node-exporter/docker-entrypoint.sh prometheus_config: @@ -398,3 +400,27 @@ services: reservations: memory: 32M cpus: "0.1" + tempo: + image: grafana/tempo:2.6.1 + command: "-target=scalable-single-binary -config.file=/etc/tempo.yaml" + configs: + - source: tempo_config + target: /etc/tempo.yaml + networks: + - monitored + deploy: + labels: + - traefik.enable=true + - traefik.docker.network=${PUBLIC_NETWORK} + - traefik.http.services.tempo.loadbalancer.server.port=9095 + - traefik.http.routers.tempo.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/tempo`) + - traefik.http.routers.tempo.priority=10 + - traefik.http.routers.tempo.entrypoints=https + - traefik.http.routers.tempo.tls=true + - traefik.http.middlewares.tempo_replace_regex.replacepathregex.regex=^/tempo/?(.*)$$ + - traefik.http.middlewares.tempo_replace_regex.replacepathregex.replacement=/$${1} + - traefik.http.routers.tempo.middlewares=ops_whitelist_ips@swarm, ops_gzip@swarm, tempo_replace_regex + resources: + limits: + memory: 2000M + cpus: "2.0" diff --git a/services/monitoring/grafana/terraform/datasources.tf b/services/monitoring/grafana/terraform/datasources.tf index ecfd9686..62a5ab38 100644 --- a/services/monitoring/grafana/terraform/datasources.tf +++ b/services/monitoring/grafana/terraform/datasources.tf @@ -15,3 +15,11 @@ resource "grafana_data_source" "prometheuscatchall" { is_default = false uid = "RmZEr52nz" } + +resource "grafana_data_source" "tempo" { + type = "tempo" + name = "tempo" + url = var.TEMPO_URL + basic_auth_enabled = false + is_default = false +} diff --git a/services/monitoring/grafana/terraform/main.tf.j2 b/services/monitoring/grafana/terraform/main.tf.j2 index 0e1bd6f0..914b5fd0 100644 --- a/services/monitoring/grafana/terraform/main.tf.j2 +++ b/services/monitoring/grafana/terraform/main.tf.j2 @@ -17,10 +17,10 @@ terraform { skip_credentials_validation = true skip_requesting_account_id = true skip_metadata_api_check = true - skip_region_validation = true - skip_s3_checksum = true + skip_region_validation = true + skip_s3_checksum = true use_path_style = true - endpoints = { + endpoints = { s3 = "{{ GRAFANA_TERRAFORM_STATE_BACKEND_S3_ENDPOINT }}" } {% endif %} diff --git a/services/monitoring/grafana/terraform/variables.tf b/services/monitoring/grafana/terraform/variables.tf index fd1d8ed7..23cedf1d 100644 --- a/services/monitoring/grafana/terraform/variables.tf +++ b/services/monitoring/grafana/terraform/variables.tf @@ -2,6 +2,10 @@ variable "GRAFANA_URL" { description = "grafana_url" sensitive = false } +variable "TEMPO_URL" { + description = "tempo_url" + sensitive = false +} variable "GRAFANA_AUTH" { description = "Username:Password" sensitive = true diff --git a/services/monitoring/template.env b/services/monitoring/template.env index d2f502d6..2e3a9251 100644 --- a/services/monitoring/template.env +++ b/services/monitoring/template.env @@ -21,3 +21,9 @@ MONITORING_PROMETHEUS_PGSQL_GID_MONITORED=${MONITORING_PROMETHEUS_PGSQL_GID_MONI MONITORING_PROMETHEUS_SMOKEPING_TARGETS=${MONITORING_PROMETHEUS_SMOKEPING_TARGETS} PUBLIC_NETWORK=${PUBLIC_NETWORK} MONITORED_NETWORK=${MONITORED_NETWORK} +TEMPO_S3_BUCKET=${TEMPO_S3_BUCKET} +STORAGE_DOMAIN=${STORAGE_DOMAIN} +S3_REGION=${S3_REGION} +S3_ACCESS_KEY=${S3_ACCESS_KEY} +S3_SECRET_KEY=${S3_SECRET_KEY} +TF_VAR_PROMETHEUS_CATCHALL_URL=${TF_VAR_PROMETHEUS_CATCHALL_URL} diff --git a/services/monitoring/tempo_config.yaml.j2 b/services/monitoring/tempo_config.yaml.j2 new file mode 100644 index 00000000..eae0ed03 --- /dev/null +++ b/services/monitoring/tempo_config.yaml.j2 @@ -0,0 +1,52 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: # this configuration will listen on all ports and protocols that tempo is capable of. + otlp: + protocols: + http: + grpc: + +#ingester: +# max_block_duration: 5m # cut the headblock when this much time passes. this should probably be left alone normally + +compactor: + compaction: + block_retention: 96h # overall Tempo trace retention. + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: {{ MACHINE_FQDN }} + storage: + path: /var/tempo/generator/wal + remote_write: + - url: {{ TF_VAR_PROMETHEUS_CATCHALL_URL }}/api/v1/write + +storage: + trace: + backend: s3 # backend configuration to use + wal: + path: /var/tempo/wal # where to store the wal locally + s3: + bucket: {{ TEMPO_S3_BUCKET }} # how to store data in s3 + endpoint: {{STORAGE_DOMAIN}} + region: {{S3_REGION}} + access_key: {{S3_ACCESS_KEY}} + secret_key: {{S3_SECRET_KEY}} + insecure: false + tls_insecure_skip_verify: true + # For using AWS, select the appropriate regional endpoint and region + # endpoint: s3.dualstack.us-west-2.amazonaws.com + # region: us-west-2 + +querier: + frontend_worker: + frontend_address: localhost:9095 + +overrides: + defaults: + metrics_generator: + processors: ['service-graphs', 'span-metrics']