Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add: Prometheus Federation 🚧 #439

Merged
merged 10 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 3 additions & 17 deletions scripts/deployments/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ minio_enabled=1
start_simcore=0
start_opsstack=0
stack_target=local
without_deploy_agent=1

usage="$(basename "$0") [-h] [--key=value]

Expand All @@ -61,8 +60,7 @@ where keys are:
--minio_enabled (default: ${minio_enabled})
--start_opsstack (default: ${start_opsstack})
--stack_target (default: ${stack_target})
--vcs_check (default: ${vcs_check})
--without_deploy_agent (default: ${without_deploy_agent})"
--vcs_check (default: ${vcs_check})"

for i in "$@"; do
case $i in # Infos on bash case statements https://linuxize.com/post/bash-case-statement/
Expand All @@ -86,10 +84,6 @@ for i in "$@"; do
vcs_check="${i#*=}"
;;
##
--without_deploy_agent=*)
without_deploy_agent="${i#*=}"
;;
##
:|--help|-h)
echo "$usage" && exit 0
shift
Expand Down Expand Up @@ -266,16 +260,8 @@ if [ "$start_opsstack" -eq 0 ]; then
popd
fi
if [ "$start_simcore" -eq 0 ]; then
if [ "$without_deploy_agent" -eq 0 ]; then
log_info "starting simcore without deployment agent..."
"${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash
else
# -------------------------------- DEPlOYMENT-AGENT -------------------------------
log_info "starting deployment-agent for simcore..."
pushd "${repo_basedir}"/services/deployment-agent;
make down up-"$stack_target";
popd
fi
log_info "starting simcore..."
"${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash
fi
# shellcheck disable=2235

Expand Down
56 changes: 53 additions & 3 deletions services/monitoring/docker-compose.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ volumes:
grafana_data: {}
alertmanager_data: {}
prometheus_data_cadvisor: {}
prometheus_data_federation: {}

networks:
internal:
Expand All @@ -26,6 +27,8 @@ configs:
file: ./prometheus/prometheus.yml
prometheus_config_cadvisor:
file: ./prometheus/prometheus-cadvisor.yml
prometheus_config_federation:
file: ./prometheus/prometheus-federation.yml
prometheus_rules:
file: ./prometheus/prometheus.rules.yml
grafana_image_renderer_config:
Expand All @@ -37,7 +40,7 @@ configs:
services:
prometheuscatchall:
hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
image: prom/prometheus:v2.46.0
image: prom/prometheus:v2.47.2
volumes:
- prometheus_data:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
Expand Down Expand Up @@ -82,10 +85,56 @@ services:
reservations:
memory: 2048M
cpus: "0.2"

# Prometheus instance that serves the federated subset of metrics; it loads the
# `prometheus_config_federation` config and is published by Traefik under
# https://${MONITORING_DOMAIN}/prometheusfederation/.
prometheusfederation:
  hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
  image: prom/prometheus:v2.47.2
  volumes:
    - prometheus_data_federation:/prometheus
    # NOTE(review): prometheus-federation.yml only declares static_configs, so the
    # docker socket mount (and therefore `user: root`) may be unnecessary here - confirm.
    - /var/run/docker.sock:/var/run/docker.sock:ro
  user: root # only user root can use the docker socket
  configs:
    - source: prometheus_config_federation
      target: /etc/prometheus/prometheus.yml
    # NOTE(review): the federation config shown in this change has no `rule_files`
    # entry, so this mounted rules file looks unused - confirm.
    - source: prometheus_rules
      target: /etc/prometheus/prometheus.rules.yml
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # `--storage.tsdb.retention` is the deprecated spelling; `.time` is its replacement.
    - "--storage.tsdb.retention.time=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}"
    - "--web.console.libraries=/usr/share/prometheus/console_libraries"
    - "--web.console.templates=/usr/share/prometheus/consoles"
    - "--web.external-url=https://${MONITORING_DOMAIN}/prometheusfederation/"
    # Traefik strips the /prometheusfederation prefix (see labels), so serve from "/".
    - "--web.route-prefix=/"
    # NOTE(review): believed to be a deprecated no-op on Prometheus >= 2.39 - confirm before removing.
    - "--storage.tsdb.allow-overlapping-blocks" # via https://jessicagreben.medium.com/prometheus-fill-in-data-for-new-recording-rules-30a14ccb8467
    #- "--web.enable-admin-api" This allows messing with prometheus using its API from the CLI. Disabled for security reasons by default.
  networks:
    - internal
    - monitored
    - public
  extra_hosts: []
  deploy:
    labels:
      - traefik.enable=true
      - traefik.docker.network=${PUBLIC_NETWORK}
      # direct access through port
      - traefik.http.services.prometheusfederation.loadbalancer.server.port=${MONITORING_PROMETHEUS_PORT}
      - traefik.http.routers.prometheusfederation.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/prometheusfederation`)
      - traefik.http.routers.prometheusfederation.entrypoints=https
      - traefik.http.routers.prometheusfederation.tls=true
      - traefik.http.middlewares.prometheusfederation_stripprefixregex.stripprefixregex.regex=^/prometheusfederation
      - traefik.http.routers.prometheusfederation.middlewares=ops_whitelist_ips@docker, ops_auth@docker, ops_gzip@docker, prometheusfederation_stripprefixregex
      - prometheus-job=prometheusfederation
      - prometheus-port=${MONITORING_PROMETHEUS_PORT}
    resources:
      limits:
        memory: 4096M
        cpus: "2"
      reservations:
        memory: 2048M
        cpus: "0.2"
prometheuscadvisor:
hostname: "{% raw %}{{.Service.Name}}{% endraw %}"
image: prom/prometheus:v2.46.0
image: prom/prometheus:v2.47.2
volumes:
- prometheus_data_cadvisor:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
Expand All @@ -111,6 +160,7 @@ services:
- public
extra_hosts: []
deploy:
replicas: 0
labels:
- traefik.enable=true
- traefik.docker.network=${PUBLIC_NETWORK}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "avg(production_members_in_gid_135745)",
"expr": "avg(osparc_production_members_in_gid_135745)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "avg(production_members_in_gid_1)",
"expr": "avg(osparc_production_members_in_gid_1)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
10 changes: 5 additions & 5 deletions services/monitoring/pgsql_query_exporter_config.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ databases:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}
dsn: postgresql://{{POSTGRES_USER}}:{{MONITORED_POSTGRES_PASSWORDS.split(",")[loop.index0]}}@{{_stack + '_postgres' if 'rds.amazonaws.com' not in POSTGRES_HOST else POSTGRES_HOST}}:{{POSTGRES_PORT}}/{{POSTGRES_DB}}{% endfor %}

metrics:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% for _gid in MONITORING_PROMETHEUS_PGSQL_GID_MONITORED.split(",") if _gid != "" %}
{{_stack}}_members_in_gid_{{_gid}}:
# Gauge with the member count of one monitored group in one stack. The rendered
# key must equal the name listed in the matching query's `metrics` list.
osparc_{{_stack}}_members_in_gid_{{_gid}}:
  type: gauge
  description: Number of users in group {{_gid}}{% endfor %}
{{_stack}}_total_number_of_users:
# Gauge with the number of rows in the `users` table of one stack, as filled by
# the queries that list this metric.
osparc_{{_stack}}_total_number_of_users:
  type: gauge
  description: Total number of registered users{% endfor %}

Expand All @@ -15,7 +15,7 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f
query_{{_stack}}_members_in_gid_{{_gid}}:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_members_in_gid_{{_gid}}]
metrics: [osparc_{{_stack}}_members_in_gid_{{_gid}}]
sql: |
SELECT COUNT(*) as {{_stack}}_members_in_gid_{{_gid}}
FROM users
Expand All @@ -25,14 +25,14 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f
query_{{_stack}}_total_number_of_users:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_total_number_of_users]
metrics: [osparc_{{_stack}}_total_number_of_users]
# The column alias has to equal the metric name listed in `metrics` above
# (osparc_-prefixed); query-exporter maps result columns to metrics by name.
sql: |
  SELECT COUNT(*) as osparc_{{_stack}}_total_number_of_users
  FROM users;
query_{{_stack}}_total_number_of_users_excluding_guests:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_total_number_of_users]
metrics: [osparc_{{_stack}}_total_number_of_users]
# The column alias has to equal the metric name listed in `metrics` above
# (osparc_-prefixed); query-exporter maps result columns to metrics by name.
# NOTE(review): this query writes the same gauge as query_<stack>_total_number_of_users,
# so the two results overwrite each other - a dedicated metric is probably intended; confirm.
sql: |
  SELECT COUNT(*) as osparc_{{_stack}}_total_number_of_users
  FROM users WHERE role <> 'GUEST';{% endfor %}
18 changes: 18 additions & 0 deletions services/monitoring/prometheus/prometheus-federation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Configuration for the federation Prometheus: it pulls a filtered set of series
# from the catch-all Prometheus through that server's /federate endpoint.
global:
  scrape_interval: 29s      # applies to jobs without their own interval (Prometheus default: 1m)
  evaluation_interval: 29s  # rule evaluation period (Prometheus default: 1m)

scrape_configs:
  # One job that fetches the selected series from the upstream Prometheus.
  - job_name: "federate"
    scrape_interval: 29s  # per-job interval, same value as the global one
    honor_labels: true    # keep the labels that arrive with the scraped series
    scheme: http
    metrics_path: "/federate"  # federation endpoint of the upstream Prometheus

    params:
      # Series selectors; a series is returned when it matches any one of them.
      "match[]":
        - '{__name__=~"s4l_.*"}'     # every metric whose name starts with 's4l_'
        - '{__name__=~"osparc_.*"}'  # every metric whose name starts with 'osparc_'

    static_configs:
      - targets:
          - "prometheuscatchall:9090"  # host:port of the Prometheus instance to federate from
2 changes: 1 addition & 1 deletion services/monitoring/prometheus/prometheus-simcore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ scrape_configs:
target_label: service_name
replacement: $1
action: replace
# Handle with "tasks." when there are multiple replica accrding to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/
# Handle with "tasks." when there are multiple replica according to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/
dns_sd_configs:
- names:
- "tasks.production_webserver"
Expand Down
62 changes: 26 additions & 36 deletions services/monitoring/prometheus/prometheus.rules.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,39 @@
groups:
- name: http_requests_total-rate-5min
- name: osparc_webserver_services_started_total_sum_by_key_tag
rules:
- record: http_requests_total:rate5m
expr: rate(http_requests_total[5m])

- name: container_tasks_state-count_by_image
rules:
- record: container_tasks_state:count_by_image
expr: count by (image)(container_tasks_state{state="running", image=~".*/simcore/services/.*"})

- name: simcore_simcore_service_webserver_services_started_total-sum_by_key_tag
rules:
- record: simcore_simcore_service_webserver_services_started_total:sum_by_service_key_service_tag
- record: osparc_webserver_services_started_total_sum_by_key_tag
expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_webserver_services_started_total)

- name: simcore_simcore_service_director_services_started_total-sum_by_key_tag
- name: osparc_director_services_started_total_sum_by_key_tag
rules:
- record: simcore_simcore_service_director_services_started_total:sum_by_service_key_service_tag
- record: osparc_director_services_started_total_sum_by_key_tag
expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_director_services_started_total)
- name: node_cpu_seconds_total-nonidle-sum_over_nodes
rules:
- record: node_cpu_seconds_total:nonidle_sum_over_nodes
expr: sum(node_cpu_seconds_total{mode!="idle"})
- name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks
interval: 1h
rules:
- record: node_cpu_seconds_total:nonidle_increase_over_nodes_12weeks
expr: sum(increase(node_cpu_seconds_total{mode!="idle"}[12w]))
- name: cpu_usage_per_simcore_service
interval: 120s
rules:
- record: osparc_metrics:cpu_usage_per_simcore_service
expr: sum by (service_name, instance, node_name) (label_replace(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=~".*simcore.*"}[1m]), "service_name", "$1", "container_label_com_docker_swarm_service_name", ".*_(.*)")) * 100
- name: cpu_usage_per_node
- name: osparc_cpu_usage_per_node
interval: 60s
rules:
- record: osparc_metrics:cpu_usage_per_node_percentage
- record: osparc_cpu_usage_per_node_percentage
expr: 100 - (avg(irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[50s])) by (instance,node_name) * 100)
- name: cpu_seconds_per_node
- name: osparc_cpu_seconds_per_node
interval: 60s
rules:
- record: osparc_metrics:cpu_seconds_per_node
- record: osparc_cpu_seconds_per_node
expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"})
- name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks-v2
- name: osparc_node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks
interval: 180s
rules:
- record: node_cpu_seconds_total_v2:nonidle_increase_over_nodes_12weeks_v2
expr: sum(increase(osparc_metrics:cpu_seconds_per_node[12w]))
- record: osparc_node_cpu_seconds_total_nonidle_increase_over_nodes_12weeks
expr: sum(increase(osparc_cpu_seconds_per_node[12w]))
# Counts container_memory_usage_bytes series whose image matches s4l-core-lite and
# whose simcore user agent is not "puppeteer"; evaluated every 60s.
- name: osparc_container_instances_s4lcorelite
  interval: 60s
  rules:
    # The record name matches the group name. It was previously
    # osparc_cpu_usage_per_node_percentage, which is already produced by the
    # osparc_cpu_usage_per_node group and would have mixed both results in one series.
    # `OR clamp_max(absent(...),0)` makes the rule yield 0 instead of no sample
    # when no matching container series exists.
    # NOTE(review): `[.osparc.io]` is a regex character class (one of those characters),
    # not the literal text ".osparc.io" - confirm whether `\.osparc\.io` was intended.
    - record: osparc_container_instances_s4lcorelite
      expr: count(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}) OR clamp_max(absent(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}),0)
- name: osparc_autoscaling_machines_active
interval: 60s
rules:
- record: osparc_autoscaling_machines_active
expr: sum(count_values("instance",node_exporter_build_info{instance=~"^ip-.*$"}))
- name: osparc_autoscaling_machines_buffer
interval: 60s
rules:
- record: osparc_autoscaling_machines_buffer
expr: sum(swarm_node_info{instance=~"^ip-.*$"}) - ( sum(count_values("instance",node_boot_time_seconds{instance=~"^ip-.*$"})) OR clamp_max(absent(node_boot_time_seconds{instance=~"^ip-.*$"}),0) )
1 change: 1 addition & 0 deletions services/monitoring/template.env
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
MONITORING_DOMAIN=${MONITORING_DOMAIN}
MONITORING_PROMETHEUS_RETENTION=${MONITORING_PROMETHEUS_RETENTION}
MONITORING_PROMETHEUS_FEDERATION_RETENTION=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}
mrnicegyu11 marked this conversation as resolved.
Show resolved Hide resolved
MONITORING_PROMETHEUS_PORT=${MONITORING_PROMETHEUS_PORT}
MONITORED_STACK_NAMES=${MONITORED_STACK_NAMES}
MONITORED_POSTGRES_PASSWORDS=${MONITORED_POSTGRES_PASSWORDS}
Expand Down
Loading