Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add: Prometheus Federation 🚧 #439

Merged
merged 10 commits into from
Nov 17, 2023
Merged
20 changes: 3 additions & 17 deletions scripts/deployments/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ minio_enabled=1
start_simcore=0
start_opsstack=0
stack_target=local
without_deploy_agent=1

usage="$(basename "$0") [-h] [--key=value]

Expand All @@ -61,8 +60,7 @@ where keys are:
--minio_enabled (default: ${minio_enabled})
--start_opsstack (default: ${start_opsstack})
--stack_target (default: ${stack_target})
--vcs_check (default: ${vcs_check})
--without_deploy_agent (default: ${without_deploy_agent})"
--vcs_check (default: ${vcs_check})"

for i in "$@"; do
case $i in # Infos on bash case statements https://linuxize.com/post/bash-case-statement/
Expand All @@ -86,10 +84,6 @@ for i in "$@"; do
vcs_check="${i#*=}"
;;
##
--without_deploy_agent=*)
without_deploy_agent="${i#*=}"
;;
##
:|--help|-h)
echo "$usage" && exit 0
shift
Expand Down Expand Up @@ -266,16 +260,8 @@ if [ "$start_opsstack" -eq 0 ]; then
popd
fi
if [ "$start_simcore" -eq 0 ]; then
if [ "$without_deploy_agent" -eq 0 ]; then
log_info "starting simcore without deployment agent..."
"${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash
else
# -------------------------------- DEPlOYMENT-AGENT -------------------------------
log_info "starting deployment-agent for simcore..."
pushd "${repo_basedir}"/services/deployment-agent;
make down up-"$stack_target";
popd
fi
log_info "starting simcore..."
"${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash
fi
# shellcheck disable=2235

Expand Down
56 changes: 53 additions & 3 deletions services/monitoring/docker-compose.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ volumes:
grafana_data: {}
alertmanager_data: {}
prometheus_data_cadvisor: {}
prometheus_data_federation: {}

networks:
internal:
Expand All @@ -26,6 +27,8 @@ configs:
file: ./prometheus/prometheus.yml
prometheus_config_cadvisor:
file: ./prometheus/prometheus-cadvisor.yml
prometheus_config_federation:
file: ./prometheus/prometheus-federation.yml
prometheus_rules:
file: ./prometheus/prometheus.rules.yml
grafana_image_renderer_config:
Expand All @@ -37,7 +40,7 @@ configs:
services:
prometheuscatchall:
hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
image: prom/prometheus:v2.46.0
image: prom/prometheus:v2.47.2
volumes:
- prometheus_data:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
Expand Down Expand Up @@ -82,10 +85,56 @@ services:
reservations:
memory: 2048M
cpus: "0.2"

# Prometheus instance that runs with the federation scrape configuration
# (config `prometheus_config_federation`) and is published through Traefik
# under https://${MONITORING_DOMAIN}/prometheusfederation/.
prometheusfederation:
  hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
  image: prom/prometheus:v2.47.2
  volumes:
    - prometheus_data_federation:/prometheus
    - /var/run/docker.sock:/var/run/docker.sock:ro
  user: root # only user root can use the docker socket
  configs:
    - source: prometheus_config_federation
      target: /etc/prometheus/prometheus.yml
    - source: prometheus_rules
      target: /etc/prometheus/prometheus.rules.yml
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # `--storage.tsdb.retention` is deprecated; `--storage.tsdb.retention.time`
    # is its documented replacement and accepts the same duration values.
    - "--storage.tsdb.retention.time=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}"
    - "--web.console.libraries=/usr/share/prometheus/console_libraries"
    - "--web.console.templates=/usr/share/prometheus/consoles"
    # External URL carries the sub-path, while the route prefix stays "/"
    # because Traefik strips `/prometheusfederation` before forwarding.
    - "--web.external-url=https://${MONITORING_DOMAIN}/prometheusfederation/"
    - "--web.route-prefix=/"
    # `--storage.tsdb.allow-overlapping-blocks` is intentionally not passed:
    # overlapping blocks are enabled by default since Prometheus v2.39, where
    # the flag became a no-op. Backfilling recording rules still works, see
    # https://jessicagreben.medium.com/prometheus-fill-in-data-for-new-recording-rules-30a14ccb8467
    #- "--web.enable-admin-api" This allows messing with prometheus using its API from the CLI. Disabled for security reasons by default.
  networks:
    - internal
    - monitored
    - public
  extra_hosts: []
  deploy:
    labels:
      - traefik.enable=true
      - traefik.docker.network=${PUBLIC_NETWORK}
      # direct access through port
      - traefik.http.services.prometheusfederation.loadbalancer.server.port=${MONITORING_PROMETHEUS_PORT}
      - traefik.http.routers.prometheusfederation.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/prometheusfederation`)
      - traefik.http.routers.prometheusfederation.entrypoints=https
      - traefik.http.routers.prometheusfederation.tls=true
      - traefik.http.middlewares.prometheusfederation_stripprefixregex.stripprefixregex.regex=^/prometheusfederation
      - traefik.http.routers.prometheusfederation.middlewares=ops_whitelist_ips@docker, ops_auth@docker, ops_gzip@docker, prometheusfederation_stripprefixregex
      # Labels presumably consumed by the catch-all Prometheus service
      # discovery so this instance gets scraped as well - TODO confirm.
      - prometheus-job=prometheusfederation
      - prometheus-port=${MONITORING_PROMETHEUS_PORT}
    resources:
      limits:
        memory: 4096M
        cpus: "2"
      reservations:
        memory: 2048M
        cpus: "0.2"
prometheuscadvisor:
hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
image: prom/prometheus:v2.46.0
image: prom/prometheus:v2.47.2
volumes:
- prometheus_data_cadvisor:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
Expand All @@ -111,6 +160,7 @@ services:
- public
extra_hosts: []
deploy:
replicas: 0
labels:
- traefik.enable=true
- traefik.docker.network=${PUBLIC_NETWORK}
Expand Down
19 changes: 19 additions & 0 deletions services/monitoring/prometheus/prometheus-federation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Prometheus configuration for the federation instance: it scrapes the
# `/federate` endpoint of the target listed under `static_configs` and keeps
# only the series selected by the `match[]` parameters.
global:
  scrape_interval: 29s # Scrape every 29 seconds. Default is every 1 minute.
  evaluation_interval: 29s # Evaluate rules every 29 seconds. Default is every 1 minute.

scrape_configs:
  - job_name: 'federate' # A job defines a series of targets and parameters describing how to scrape them.
    scrape_interval: 29s # Explicit per-job interval; currently the same value as the global one.
    honor_labels: true # Do not overwrite labels in scraped data.
    scheme: http
    metrics_path: '/federate' # Path to fetch the metrics from, '/federate' is for federation.

    params:
      # Each match[] entry is a series selector; a series is retrieved when
      # it matches any of them (logical OR between entries).
      'match[]':
        - '{__name__=~"s4l_.*"}' # All metrics whose name starts with 's4l_'
        - '{__name__="up"}' # Exactly the metric 'up' (equality matcher, no regex needed)
        - '{job="dy-sidecar-exporter"}' # Every series whose job label is 'dy-sidecar-exporter'

    static_configs:
      # Host and port of the Prometheus instance(s) to federate from.
      - targets: ['prometheuscatchall:9090']
2 changes: 1 addition & 1 deletion services/monitoring/prometheus/prometheus-simcore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ scrape_configs:
target_label: service_name
replacement: $1
action: replace
# Handle with "tasks." when there are multiple replica accrding to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/
# Handle with "tasks." when there are multiple replica according to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/
dns_sd_configs:
- names:
- "tasks.production_webserver"
Expand Down
1 change: 1 addition & 0 deletions services/monitoring/template.env
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
MONITORING_DOMAIN=${MONITORING_DOMAIN}
MONITORING_PROMETHEUS_RETENTION=${MONITORING_PROMETHEUS_RETENTION}
MONITORING_PROMETHEUS_FEDERATION_RETENTION=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}
mrnicegyu11 marked this conversation as resolved.
Show resolved Hide resolved
MONITORING_PROMETHEUS_PORT=${MONITORING_PROMETHEUS_PORT}
MONITORED_STACK_NAMES=${MONITORED_STACK_NAMES}
MONITORED_POSTGRES_PASSWORDS=${MONITORED_POSTGRES_PASSWORDS}
Expand Down
Loading