From 5dfa686668cf2d9e1fac3e8653864a6a2379e99f Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Thu, 16 Nov 2023 15:51:24 +0100 Subject: [PATCH] Rename and purge prometheus rules, add common prefixes s4l_ and osparc_ --- .../simcore/s4l-lite-admin-overview.json | 2 +- .../dashboards/simcore/admin-overview.json | 2 +- .../pgsql_query_exporter_config.yaml.j2 | 10 +-- .../prometheus/prometheus-federation.yml | 5 +- .../prometheus/prometheus.rules.yml | 62 ++++++++----------- 5 files changed, 35 insertions(+), 46 deletions(-) diff --git a/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json b/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json index 54c86e1f..665837c3 100644 --- a/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json +++ b/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json @@ -87,7 +87,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "avg(production_members_in_gid_135745)", + "expr": "avg(osparc_production_members_in_gid_135745)", "legendFormat": "__auto", "range": true, "refId": "A" diff --git a/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json b/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json index 3a8cc228..a3455078 100644 --- a/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json +++ b/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json @@ -87,7 +87,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "avg(production_members_in_gid_1)", + "expr": "avg(osparc_production_members_in_gid_1)", "legendFormat": "__auto", "range": true, "refId": "A" diff --git a/services/monitoring/pgsql_query_exporter_config.yaml.j2 b/services/monitoring/pgsql_query_exporter_config.yaml.j2 index 
09684c20..1e55ab9e 100644 --- a/services/monitoring/pgsql_query_exporter_config.yaml.j2 +++ b/services/monitoring/pgsql_query_exporter_config.yaml.j2 @@ -3,10 +3,10 @@ databases:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %} dsn: postgresql://{{POSTGRES_USER}}:{{MONITORED_POSTGRES_PASSWORDS.split(",")[loop.index0]}}@{{_stack + '_postgres' if 'rds.amazonaws.com' not in POSTGRES_HOST else POSTGRES_HOST}}:{{POSTGRES_PORT}}/{{POSTGRES_DB}}{% endfor %} metrics:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% for _gid in MONITORING_PROMETHEUS_PGSQL_GID_MONITORED.split(",") if _gid != "" %} - {{_stack}}_members_in_gid_{{_gid}}: + osparc_{{_stack}}_members_in_gid_{{_gid}}: type: gauge description: Number of users in group {{_gid}}{% endfor %} - {{_stack}}_total_number_of_users: + osparc_{{_stack}}_total_number_of_users: type: gauge description: Total nuber of registered users{% endfor %} @@ -15,7 +15,7 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f query_{{_stack}}_members_in_gid_{{_gid}}: interval: 55 databases: [postgres_{{_stack}}] - metrics: [{{_stack}}_members_in_gid_{{_gid}}] + metrics: [osparc_{{_stack}}_members_in_gid_{{_gid}}] sql: | SELECT COUNT(*) as {{_stack}}_members_in_gid_{{_gid}} FROM users @@ -25,14 +25,14 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f query_{{_stack}}_total_number_of_users: interval: 55 databases: [postgres_{{_stack}}] - metrics: [{{_stack}}_total_number_of_users] + metrics: [osparc_{{_stack}}_total_number_of_users] sql: | SELECT COUNT(*) as {{_stack}}_total_number_of_users FROM users; query_{{_stack}}_total_number_of_users_excluding_guests: interval: 55 databases: [postgres_{{_stack}}] - metrics: [{{_stack}}_total_number_of_users] + metrics: [osparc_{{_stack}}_total_number_of_users] sql: | SELECT COUNT(*) as {{_stack}}_total_number_of_users FROM users WHERE role <> 'GUEST';{% endfor %} diff --git 
a/services/monitoring/prometheus/prometheus-federation.yml b/services/monitoring/prometheus/prometheus-federation.yml index c90e2302..d7ae2311 100644 --- a/services/monitoring/prometheus/prometheus-federation.yml +++ b/services/monitoring/prometheus/prometheus-federation.yml @@ -11,9 +11,8 @@ scrape_configs: params: 'match[]': # The match[] parameter is used to select the metrics to retrieve. Logical OR is used between match[] parameters. - - '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 'http_' - - '{__name__=~"up"}' # Regex to select the metric 'up' - - '{job="dy-sidecar-exporter"}' + - '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 's4l_' + - '{__name__=~"osparc_.*"}' # Regex to select all metrics that start with 'osparc_' static_configs: - targets: ['prometheuscatchall:9090'] # The targets section is where you specify the host and port of the Prometheus instances to scrape. diff --git a/services/monitoring/prometheus/prometheus.rules.yml b/services/monitoring/prometheus/prometheus.rules.yml index d77c95e3..a80a54af 100644 --- a/services/monitoring/prometheus/prometheus.rules.yml +++ b/services/monitoring/prometheus/prometheus.rules.yml @@ -1,49 +1,39 @@ groups: - - name: http_requests_total-rate-5min + - name: osparc_webserver_services_started_total-sum_by_key_tag rules: - - record: http_requests_total:rate5m - expr: rate(http_requests_total[5m]) - - - name: container_tasks_state-count_by_image - rules: - - record: container_tasks_state:count_by_image - expr: count by (image)(container_tasks_state{state="running", image=~".*/simcore/services/.*"}) - - - name: simcore_simcore_service_webserver_services_started_total-sum_by_key_tag - rules: - - record: simcore_simcore_service_webserver_services_started_total:sum_by_service_key_service_tag + - record: osparc_webserver_services_started_total_sum_by_service_key_service_tag expr: sum by (service_key, service_tag, 
deployment)(simcore_simcore_service_webserver_services_started_total) - - - name: simcore_simcore_service_director_services_started_total-sum_by_key_tag + - name: osparc_director_services_started_total-sum_by_key_tag rules: - - record: simcore_simcore_service_director_services_started_total:sum_by_service_key_service_tag + - record: osparc_director_services_started_total_sum_by_service_key_service_tag expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_director_services_started_total) - - name: node_cpu_seconds_total-nonidle-sum_over_nodes - rules: - - record: node_cpu_seconds_total:nonidle_sum_over_nodes - expr: sum(node_cpu_seconds_total{mode!="idle"}) - - name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks - interval: 1h - rules: - - record: node_cpu_seconds_total:nonidle_increase_over_nodes_12weeks - expr: sum(increase(node_cpu_seconds_total{mode!="idle"}[12w])) - - name: cpu_usage_per_simcore_service - interval: 120s - rules: - - record: osparc_metrics:cpu_usage_per_simcore_service - expr: sum by (service_name, instance, node_name) (label_replace(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=~".*simcore.*"}[1m]), "service_name", "$1", "container_label_com_docker_swarm_service_name", ".*_(.*)")) * 100 - - name: cpu_usage_per_node + - name: osparc_cpu_usage_per_node interval: 60s rules: - - record: osparc_metrics:cpu_usage_per_node_percentage + - record: osparc_cpu_usage_per_node_percentage expr: 100 - (avg(irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[50s])) by (instance,node_name) * 100) - - name: cpu_seconds_per_node + - name: osparc_cpu_seconds_per_node interval: 60s rules: - - record: osparc_metrics:cpu_seconds_per_node + - record: osparc_cpu_seconds_per_node expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) - - name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks-v2 + - name: 
osparc_node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks interval: 180s rules: - - record: node_cpu_seconds_total_v2:nonidle_increase_over_nodes_12weeks_v2 - expr: sum(increase(osparc_metrics:cpu_seconds_per_node[12w])) + - record: osparc_node_cpu_seconds_total_nonidle_increase_over_nodes_12weeks + expr: sum(increase(osparc_cpu_seconds_per_node[12w])) + - name: osparc_container_instances_s4lcorelite + interval: 60s + rules: + - record: osparc_container_instances_s4lcorelite + expr: count(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}) OR clamp_max(absent(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}),0) + - name: osparc_autoscaling_machines_active + interval: 60s + rules: + - record: osparc_autoscaling_machines_active + expr: sum(count_values("instance",node_exporter_build_info{instance=~"^ip-.*$"})) + - name: osparc_autoscaling_machines_buffer + interval: 60s + rules: + - record: osparc_autoscaling_machines_buffer + expr: sum(swarm_node_info{instance=~"^ip-.*$"}) - ( sum(count_values("instance",node_boot_time_seconds{instance=~"^ip-.*$"})) OR clamp_max(absent(node_boot_time_seconds{instance=~"^ip-.*$"}),0) )