Skip to content

Commit

Permalink
✨ Add: Prometheus Federation 🚧 (#439)
Browse files Browse the repository at this point in the history
* Fix typo

* Add federated prometheus, bump prometheus minor version

* Scale cadvisor resource monitoring prometheus to zero

* minor fixes

* Rename and purge prometheus rules, add common prefixes s4l_ and osparc_

* fix typo

* fix typo

---------

Co-authored-by: kaiser <[email protected]>
Co-authored-by: Dustin Kaiser <[email protected]>
  • Loading branch information
3 people authored Nov 17, 2023
1 parent b794623 commit dae66c4
Show file tree
Hide file tree
Showing 9 changed files with 109 additions and 64 deletions.
20 changes: 3 additions & 17 deletions scripts/deployments/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ minio_enabled=1
start_simcore=0
start_opsstack=0
stack_target=local
without_deploy_agent=1

usage="$(basename "$0") [-h] [--key=value]
Expand All @@ -61,8 +60,7 @@ where keys are:
--minio_enabled (default: ${minio_enabled})
--start_opsstack (default: ${start_opsstack})
--stack_target (default: ${stack_target})
--vcs_check (default: ${vcs_check})
--without_deploy_agent (default: ${without_deploy_agent})"
--vcs_check (default: ${vcs_check})"

for i in "$@"; do
case $i in # Infos on bash case statements https://linuxize.com/post/bash-case-statement/
Expand All @@ -86,10 +84,6 @@ for i in "$@"; do
vcs_check="${i#*=}"
;;
##
--without_deploy_agent=*)
without_deploy_agent="${i#*=}"
;;
##
:|--help|-h)
echo "$usage" && exit 0
shift
Expand Down Expand Up @@ -266,16 +260,8 @@ if [ "$start_opsstack" -eq 0 ]; then
popd
fi
if [ "$start_simcore" -eq 0 ]; then
if [ "$without_deploy_agent" -eq 0 ]; then
log_info "starting simcore without deployment agent..."
"${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash
else
# -------------------------------- DEPlOYMENT-AGENT -------------------------------
log_info "starting deployment-agent for simcore..."
pushd "${repo_basedir}"/services/deployment-agent;
make down up-"$stack_target";
popd
fi
log_info "starting simcore..."
"${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash
fi
# shellcheck disable=2235

Expand Down
56 changes: 53 additions & 3 deletions services/monitoring/docker-compose.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ volumes:
grafana_data: {}
alertmanager_data: {}
prometheus_data_cadvisor: {}
prometheus_data_federation: {}

networks:
internal:
Expand All @@ -26,6 +27,8 @@ configs:
file: ./prometheus/prometheus.yml
prometheus_config_cadvisor:
file: ./prometheus/prometheus-cadvisor.yml
prometheus_config_federation:
file: ./prometheus/prometheus-federation.yml
prometheus_rules:
file: ./prometheus/prometheus.rules.yml
grafana_image_renderer_config:
Expand All @@ -37,7 +40,7 @@ configs:
services:
prometheuscatchall:
hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
image: prom/prometheus:v2.46.0
image: prom/prometheus:v2.47.2
volumes:
- prometheus_data:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
Expand Down Expand Up @@ -82,10 +85,56 @@ services:
reservations:
memory: 2048M
cpus: "0.2"

prometheusfederation:
  # Prometheus instance started with the federation scrape config
  # (prometheus_config_federation) and exposed through traefik under
  # https://${MONITORING_DOMAIN}/prometheusfederation/.
  hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
  image: prom/prometheus:v2.47.2
  volumes:
    - prometheus_data_federation:/prometheus
    # NOTE(review): the federation scrape config only uses a static target, so
    # the docker socket (and therefore user root) may not be needed here - confirm.
    - /var/run/docker.sock:/var/run/docker.sock:ro
  user: root # only user root can use the docker socket
  configs:
    - source: prometheus_config_federation
      target: /etc/prometheus/prometheus.yml
    # NOTE(review): the rules file is mounted, but it is only evaluated if the
    # mounted prometheus.yml lists it under rule_files - confirm.
    - source: prometheus_rules
      target: /etc/prometheus/prometheus.rules.yml
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # --storage.tsdb.retention is deprecated; --storage.tsdb.retention.time is
    # its replacement and accepts the same duration value.
    - "--storage.tsdb.retention.time=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}"
    - "--web.console.libraries=/usr/share/prometheus/console_libraries"
    - "--web.console.templates=/usr/share/prometheus/consoles"
    - "--web.external-url=https://${MONITORING_DOMAIN}/prometheusfederation/"
    - "--web.route-prefix=/"
    # --storage.tsdb.allow-overlapping-blocks was dropped: the flag is a
    # deprecated no-op on this image version (overlapping blocks are always on).
    #- "--web.enable-admin-api" This allows messing with prometheus using its API from the CLI. Disabled for security reasons by default.
  networks:
    - internal
    - monitored
    - public
  extra_hosts: []
  deploy:
    labels:
      - traefik.enable=true
      - traefik.docker.network=${PUBLIC_NETWORK}
      # direct access through port
      - traefik.http.services.prometheusfederation.loadbalancer.server.port=${MONITORING_PROMETHEUS_PORT}
      - traefik.http.routers.prometheusfederation.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/prometheusfederation`)
      - traefik.http.routers.prometheusfederation.entrypoints=https
      - traefik.http.routers.prometheusfederation.tls=true
      # The external path prefix is stripped before forwarding because
      # prometheus itself serves on "/" (--web.route-prefix=/).
      - traefik.http.middlewares.prometheusfederation_stripprefixregex.stripprefixregex.regex=^/prometheusfederation
      - traefik.http.routers.prometheusfederation.middlewares=ops_whitelist_ips@docker, ops_auth@docker, ops_gzip@docker, prometheusfederation_stripprefixregex
      - prometheus-job=prometheusfederation
      - prometheus-port=${MONITORING_PROMETHEUS_PORT}
    resources:
      limits:
        memory: 4096M
        cpus: "2"
      reservations:
        memory: 2048M
        cpus: "0.2"
prometheuscadvisor:
hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
image: prom/prometheus:v2.46.0
image: prom/prometheus:v2.47.2
volumes:
- prometheus_data_cadvisor:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
Expand All @@ -111,6 +160,7 @@ services:
- public
extra_hosts: []
deploy:
replicas: 0
labels:
- traefik.enable=true
- traefik.docker.network=${PUBLIC_NETWORK}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "avg(production_members_in_gid_135745)",
"expr": "avg(osparc_production_members_in_gid_135745)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "avg(production_members_in_gid_1)",
"expr": "avg(osparc_production_members_in_gid_1)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
10 changes: 5 additions & 5 deletions services/monitoring/pgsql_query_exporter_config.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ databases:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}
dsn: postgresql://{{POSTGRES_USER}}:{{MONITORED_POSTGRES_PASSWORDS.split(",")[loop.index0]}}@{{_stack + '_postgres' if 'rds.amazonaws.com' not in POSTGRES_HOST else POSTGRES_HOST}}:{{POSTGRES_PORT}}/{{POSTGRES_DB}}{% endfor %}

metrics:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% for _gid in MONITORING_PROMETHEUS_PGSQL_GID_MONITORED.split(",") if _gid != "" %}
{{_stack}}_members_in_gid_{{_gid}}:
osparc_{{_stack}}_members_in_gid_{{_gid}}:
type: gauge
description: Number of users in group {{_gid}}{% endfor %}
{{_stack}}_total_number_of_users:
osparc_{{_stack}}_total_number_of_users:
type: gauge
description: Total number of registered users{% endfor %}

Expand All @@ -15,7 +15,7 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f
query_{{_stack}}_members_in_gid_{{_gid}}:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_members_in_gid_{{_gid}}]
metrics: [osparc_{{_stack}}_members_in_gid_{{_gid}}]
sql: |
SELECT COUNT(*) as osparc_{{_stack}}_members_in_gid_{{_gid}}
FROM users
Expand All @@ -25,14 +25,14 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f
query_{{_stack}}_total_number_of_users:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_total_number_of_users]
metrics: [osparc_{{_stack}}_total_number_of_users]
sql: |
SELECT COUNT(*) as osparc_{{_stack}}_total_number_of_users
FROM users;
query_{{_stack}}_total_number_of_users_excluding_guests:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_total_number_of_users]
metrics: [osparc_{{_stack}}_total_number_of_users]
sql: |
SELECT COUNT(*) as osparc_{{_stack}}_total_number_of_users
FROM users WHERE role <> 'GUEST';{% endfor %}
18 changes: 18 additions & 0 deletions services/monitoring/prometheus/prometheus-federation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Scrape configuration for the federation Prometheus: a single job that pulls
# selected series from another Prometheus through its /federate endpoint.
global:
  # Scrape every 29 seconds (Prometheus default: 1 minute).
  scrape_interval: 29s
  # Evaluate rules every 29 seconds (Prometheus default: 1 minute).
  evaluation_interval: 29s

scrape_configs:
  # A job groups a set of targets with the parameters used to scrape them.
  - job_name: 'federate'
    # Job-level interval; takes precedence over the global scrape interval.
    scrape_interval: 29s
    # Keep the labels found in the scraped data instead of overwriting them.
    honor_labels: true
    scheme: http
    # '/federate' is the federation endpoint of the scraped Prometheus.
    metrics_path: '/federate'

    params:
      # match[] selects the series to retrieve; several match[] entries are
      # combined with a logical OR.
      'match[]':
        # every metric whose name starts with 's4l_'
        - '{__name__=~"s4l_.*"}'
        # every metric whose name starts with 'osparc_'
        - '{__name__=~"osparc_.*"}'

    static_configs:
      # host:port of the Prometheus instance(s) to federate from
      - targets:
          - 'prometheuscatchall:9090'
2 changes: 1 addition & 1 deletion services/monitoring/prometheus/prometheus-simcore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ scrape_configs:
target_label: service_name
replacement: $1
action: replace
# Handle with "tasks." when there are multiple replica accrding to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/
# Handle with "tasks." when there are multiple replica according to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/
dns_sd_configs:
- names:
- "tasks.production_webserver"
Expand Down
62 changes: 26 additions & 36 deletions services/monitoring/prometheus/prometheus.rules.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,39 @@
groups:
- name: http_requests_total-rate-5min
- name: osparc_webserver_services_started_total_sum_by_key_tag
rules:
- record: http_requests_total:rate5m
expr: rate(http_requests_total[5m])

- name: container_tasks_state-count_by_image
rules:
- record: container_tasks_state:count_by_image
expr: count by (image)(container_tasks_state{state="running", image=~".*/simcore/services/.*"})

- name: simcore_simcore_service_webserver_services_started_total-sum_by_key_tag
rules:
- record: simcore_simcore_service_webserver_services_started_total:sum_by_service_key_service_tag
- record: osparc_webserver_services_started_total_sum_by_key_tag
expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_webserver_services_started_total)

- name: simcore_simcore_service_director_services_started_total-sum_by_key_tag
- name: osparc_director_services_started_total_sum_by_key_tag
rules:
- record: simcore_simcore_service_director_services_started_total:sum_by_service_key_service_tag
- record: osparc_director_services_started_total_sum_by_key_tag
expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_director_services_started_total)
- name: node_cpu_seconds_total-nonidle-sum_over_nodes
rules:
- record: node_cpu_seconds_total:nonidle_sum_over_nodes
expr: sum(node_cpu_seconds_total{mode!="idle"})
- name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks
interval: 1h
rules:
- record: node_cpu_seconds_total:nonidle_increase_over_nodes_12weeks
expr: sum(increase(node_cpu_seconds_total{mode!="idle"}[12w]))
- name: cpu_usage_per_simcore_service
interval: 120s
rules:
- record: osparc_metrics:cpu_usage_per_simcore_service
expr: sum by (service_name, instance, node_name) (label_replace(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=~".*simcore.*"}[1m]), "service_name", "$1", "container_label_com_docker_swarm_service_name", ".*_(.*)")) * 100
- name: cpu_usage_per_node
- name: osparc_cpu_usage_per_node
interval: 60s
rules:
- record: osparc_metrics:cpu_usage_per_node_percentage
- record: osparc_cpu_usage_per_node_percentage
expr: 100 - (avg(irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[50s])) by (instance,node_name) * 100)
- name: cpu_seconds_per_node
- name: osparc_cpu_seconds_per_node
interval: 60s
rules:
- record: osparc_metrics:cpu_seconds_per_node
- record: osparc_cpu_seconds_per_node
expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"})
- name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks-v2
- name: osparc_node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks
interval: 180s
rules:
- record: node_cpu_seconds_total_v2:nonidle_increase_over_nodes_12weeks_v2
expr: sum(increase(osparc_metrics:cpu_seconds_per_node[12w]))
- record: osparc_node_cpu_seconds_total_nonidle_increase_over_nodes_12weeks
expr: sum(increase(osparc_cpu_seconds_per_node[12w]))
# Recording rule group, evaluated every 60s.
# The expression counts container_memory_usage_bytes series whose image matches
# the s4l-core-lite dynamic-service pattern and whose simcore user agent label
# is not "puppeteer"; the "OR clamp_max(absent(...),0)" branch yields 0 when no
# such series exists, so the recorded series never disappears.
# The record name matches the group name; it was previously
# osparc_cpu_usage_per_node_percentage, which collided with the per-node CPU
# usage recording rule.
# NOTE(review): "[.osparc.io]" is a regex character class (one of the listed
# characters), not the literal text ".osparc.io" - confirm the intended match.
- name: osparc_container_instances_s4lcorelite
  interval: 60s
  rules:
    - record: osparc_container_instances_s4lcorelite
      expr: count(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}) OR clamp_max(absent(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}),0)
# Recording rule group, evaluated every 60s.
# The expression selects node_exporter_build_info series whose instance label
# starts with "ip-", buckets them by sample value (count_values) and sums the
# bucket sizes, i.e. it yields the number of matching series.
# NOTE(review): presumably "ip-*" instances are the autoscaled machines - confirm.
- name: osparc_autoscaling_machines_active
interval: 60s
rules:
- record: osparc_autoscaling_machines_active
expr: sum(count_values("instance",node_exporter_build_info{instance=~"^ip-.*$"}))
# Recording rule group, evaluated every 60s.
# The expression takes the sum of swarm_node_info over instances starting with
# "ip-" and subtracts the number of node_boot_time_seconds series for such
# instances; the "OR clamp_max(absent(...),0)" branch makes the subtrahend 0
# when no node_boot_time_seconds series exists.
# NOTE(review): presumably this is "swarm nodes known minus nodes reporting
# node-exporter metrics", used as the buffer-machine count - confirm.
- name: osparc_autoscaling_machines_buffer
interval: 60s
rules:
- record: osparc_autoscaling_machines_buffer
expr: sum(swarm_node_info{instance=~"^ip-.*$"}) - ( sum(count_values("instance",node_boot_time_seconds{instance=~"^ip-.*$"})) OR clamp_max(absent(node_boot_time_seconds{instance=~"^ip-.*$"}),0) )
1 change: 1 addition & 0 deletions services/monitoring/template.env
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
MONITORING_DOMAIN=${MONITORING_DOMAIN}
MONITORING_PROMETHEUS_RETENTION=${MONITORING_PROMETHEUS_RETENTION}
MONITORING_PROMETHEUS_FEDERATION_RETENTION=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}
MONITORING_PROMETHEUS_PORT=${MONITORING_PROMETHEUS_PORT}
MONITORED_STACK_NAMES=${MONITORED_STACK_NAMES}
MONITORED_POSTGRES_PASSWORDS=${MONITORED_POSTGRES_PASSWORDS}
Expand Down

0 comments on commit dae66c4

Please sign in to comment.