From 197b3f9a83936d8cb5d0b6f8728129c5dd537a88 Mon Sep 17 00:00:00 2001 From: kaiser Date: Tue, 14 Nov 2023 15:18:46 +0100 Subject: [PATCH 1/7] Fix typo --- services/monitoring/prometheus/prometheus-simcore.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/prometheus/prometheus-simcore.yml b/services/monitoring/prometheus/prometheus-simcore.yml index b6d0b8dd..9a840d0e 100644 --- a/services/monitoring/prometheus/prometheus-simcore.yml +++ b/services/monitoring/prometheus/prometheus-simcore.yml @@ -15,7 +15,7 @@ scrape_configs: target_label: service_name replacement: $1 action: replace - # Handle with "tasks." when there are multiple replica accrding to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/ + # Handle with "tasks." when there are multiple replica according to https://www.innoq.com/en/blog/scraping-docker-swarm-service-instances-with-prometheus/ dns_sd_configs: - names: - "tasks.production_webserver" From 1f6ff31b61d6b2541fd84c8ffe1d97ad826e78fe Mon Sep 17 00:00:00 2001 From: kaiser Date: Wed, 15 Nov 2023 10:56:41 +0100 Subject: [PATCH 2/7] Add federated prometheus, bump prometheus minor version --- services/monitoring/docker-compose.yml.j2 | 54 +++++++++++++++++-- .../prometheus/prometheus-federation.yml | 19 +++++++ services/monitoring/template.env | 1 + 3 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 services/monitoring/prometheus/prometheus-federation.yml diff --git a/services/monitoring/docker-compose.yml.j2 b/services/monitoring/docker-compose.yml.j2 index cffaf637..72f70a7f 100644 --- a/services/monitoring/docker-compose.yml.j2 +++ b/services/monitoring/docker-compose.yml.j2 @@ -26,6 +26,8 @@ configs: file: ./prometheus/prometheus.yml prometheus_config_cadvisor: file: ./prometheus/prometheus-cadvisor.yml + prometheus_config_federation: + file: ./prometheus/prometheus-federation.yml prometheus_rules: file: ./prometheus/prometheus.rules.yml 
grafana_image_renderer_config: @@ -37,7 +39,7 @@ configs: services: prometheuscatchall: hostname: "{% raw %}{{.Service.Name}}{% endraw %}" - image: prom/prometheus:v2.46.0 + image: prom/prometheus:v2.47.2 volumes: - prometheus_data:/prometheus - /var/run/docker.sock:/var/run/docker.sock:ro @@ -82,10 +84,56 @@ services: reservations: memory: 2048M cpus: "0.2" - +prometheusfederation: + hostname: "{% raw %}{{.Service.Name}}{% endraw %}" + image: prom/prometheus:v2.47.2 + volumes: + - prometheus_data_federation:/prometheus + - /var/run/docker.sock:/var/run/docker.sock:ro + user: root # only user root can use the docker socket + configs: + - source: prometheus_config_federation + target: /etc/prometheus/prometheus.yml + - source: prometheus_rules + target: /etc/prometheus/prometheus.rules.yml + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--storage.tsdb.retention=${MONITORING_PROMETHEUS_FEDERATION_RETENTION}" + - "--web.console.libraries=/usr/share/prometheus/console_libraries" + - "--web.console.templates=/usr/share/prometheus/consoles" + - "--web.external-url=https://${MONITORING_DOMAIN}/prometheusfederation/" + - "--web.route-prefix=/" + - "--storage.tsdb.allow-overlapping-blocks" # via https://jessicagreben.medium.com/prometheus-fill-in-data-for-new-recording-rules-30a14ccb8467 + #- "--web.enable-admin-api" This allows messing with prometheus using its API from the CLI. Disabled for security reasons by default. 
+ networks: + - internal + - monitored + - public + extra_hosts: [] + deploy: + labels: + - traefik.enable=true + - traefik.docker.network=${PUBLIC_NETWORK} + # direct access through port + - traefik.http.services.prometheusfederation.loadbalancer.server.port=${MONITORING_PROMETHEUS_PORT} + - traefik.http.routers.prometheusfederation.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/prometheusfederation`) + - traefik.http.routers.prometheusfederation.entrypoints=https + - traefik.http.routers.prometheusfederation.tls=true + - traefik.http.middlewares.prometheusfederation_stripprefixregex.stripprefixregex.regex=^/prometheusfederation + - traefik.http.routers.prometheusfederation.middlewares=ops_whitelist_ips@docker, ops_auth@docker, ops_gzip@docker, prometheusfederation_stripprefixregex + - prometheus-job=prometheusfederation + - prometheus-port=${MONITORING_PROMETHEUS_PORT} + resources: + limits: + memory: 4096M + cpus: "2" + reservations: + memory: 2048M + cpus: "0.2" prometheuscadvisor: hostname: "{% raw %}{{.Service.Name}}{% endraw %}" - image: prom/prometheus:v2.46.0 + image: prom/prometheus:v2.47.2 volumes: - prometheus_data_cadvisor:/prometheus - /var/run/docker.sock:/var/run/docker.sock:ro diff --git a/services/monitoring/prometheus/prometheus-federation.yml b/services/monitoring/prometheus/prometheus-federation.yml new file mode 100644 index 00000000..c90e2302 --- /dev/null +++ b/services/monitoring/prometheus/prometheus-federation.yml @@ -0,0 +1,19 @@ +global: + scrape_interval: 29s # Set the scrape interval to every 29 seconds. Default is every 1 minute. + evaluation_interval: 29s # Evaluate rules every 29 seconds. The default is every 1 minute. + +scrape_configs: + - job_name: 'federate' # A job defines a series of targets and parameters describing how to scrape them. + scrape_interval: 29s # Overwrite the global scrape interval for this job, set to every 29 seconds. + honor_labels: true # Do not overwrite labels in scraped data. 
+ scheme: http + metrics_path: '/federate' # Path to fetch the metrics from, '/federate' is for federation. + + params: + 'match[]': # The match[] parameter is used to select the metrics to retrieve. Logical OR is used between match[] parameters. + - '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 'http_' + - '{__name__=~"up"}' # Regex to select the metric 'up' + - '{job="dy-sidecar-exporter"}' + + static_configs: + - targets: ['prometheuscatchall:9090'] # The targets section is where you specify the host and port of the Prometheus instances to scrape. diff --git a/services/monitoring/template.env b/services/monitoring/template.env index e370d903..da81ae34 100644 --- a/services/monitoring/template.env +++ b/services/monitoring/template.env @@ -1,5 +1,6 @@ MONITORING_DOMAIN=${MONITORING_DOMAIN} MONITORING_PROMETHEUS_RETENTION=${MONITORING_PROMETHEUS_RETENTION} +MONITORING_PROMETHEUS_FEDERATION_RETENTION=${MONITORING_PROMETHEUS_FEDERATION_RETENTION} MONITORING_PROMETHEUS_PORT=${MONITORING_PROMETHEUS_PORT} MONITORED_STACK_NAMES=${MONITORED_STACK_NAMES} MONITORED_POSTGRES_PASSWORDS=${MONITORED_POSTGRES_PASSWORDS} From 503100b12285f65907d9a5c98d24a6616ad4b20d Mon Sep 17 00:00:00 2001 From: kaiser Date: Wed, 15 Nov 2023 10:58:11 +0100 Subject: [PATCH 3/7] Scale cadvisor resource monitoring prometheus to zero --- services/monitoring/docker-compose.yml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/docker-compose.yml.j2 b/services/monitoring/docker-compose.yml.j2 index 72f70a7f..3a312f94 100644 --- a/services/monitoring/docker-compose.yml.j2 +++ b/services/monitoring/docker-compose.yml.j2 @@ -40,6 +40,7 @@ services: prometheuscatchall: hostname: "{% raw %}{{.Service.Name}}{% endraw %}" image: prom/prometheus:v2.47.2 + replicas: 0 volumes: - prometheus_data:/prometheus - /var/run/docker.sock:/var/run/docker.sock:ro From 085efab97272396ba798a8e44e26b0b41c5e8345 Mon Sep 17 00:00:00 2001 From: kaiser Date: Wed, 15 Nov 2023 
15:42:48 +0100 Subject: [PATCH 4/7] minor fixes --- scripts/deployments/deploy.sh | 20 +++----------------- services/monitoring/docker-compose.yml.j2 | 5 +++-- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/scripts/deployments/deploy.sh b/scripts/deployments/deploy.sh index 5a8fa032..a6278b95 100755 --- a/scripts/deployments/deploy.sh +++ b/scripts/deployments/deploy.sh @@ -49,7 +49,6 @@ minio_enabled=1 start_simcore=0 start_opsstack=0 stack_target=local -without_deploy_agent=1 usage="$(basename "$0") [-h] [--key=value] @@ -61,8 +60,7 @@ where keys are: --minio_enabled (default: ${minio_enabled}) --start_opsstack (default: ${start_opsstack}) --stack_target (default: ${stack_target}) - --vcs_check (default: ${vcs_check}) - --without_deploy_agent (default: ${without_deploy_agent})" + --vcs_check (default: ${vcs_check})" for i in "$@"; do case $i in # Infos on bash case statements https://linuxize.com/post/bash-case-statement/ @@ -86,10 +84,6 @@ for i in "$@"; do vcs_check="${i#*=}" ;; ## - --without_deploy_agent=*) - without_deploy_agent="${i#*=}" - ;; - ## :|--help|-h) echo "$usage" && exit 0 shift @@ -266,16 +260,8 @@ if [ "$start_opsstack" -eq 0 ]; then popd fi if [ "$start_simcore" -eq 0 ]; then - if [ "$without_deploy_agent" -eq 0 ]; then - log_info "starting simcore without deployment agent..." - "${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash - else - # -------------------------------- DEPlOYMENT-AGENT ------------------------------- - log_info "starting deployment-agent for simcore..." - pushd "${repo_basedir}"/services/deployment-agent; - make down up-"$stack_target"; - popd - fi + log_info "starting simcore..." 
+ "${repo_basedir}"/scripts/deployments/start_without_deployment_agent.bash fi # shellcheck disable=2235 diff --git a/services/monitoring/docker-compose.yml.j2 b/services/monitoring/docker-compose.yml.j2 index 3a312f94..f2ceb475 100644 --- a/services/monitoring/docker-compose.yml.j2 +++ b/services/monitoring/docker-compose.yml.j2 @@ -5,6 +5,7 @@ volumes: grafana_data: {} alertmanager_data: {} prometheus_data_cadvisor: {} + prometheus_data_federation: {} networks: internal: @@ -40,7 +41,6 @@ services: prometheuscatchall: hostname: "{% raw %}{{.Service.Name}}{% endraw %}" image: prom/prometheus:v2.47.2 - replicas: 0 volumes: - prometheus_data:/prometheus - /var/run/docker.sock:/var/run/docker.sock:ro @@ -85,7 +85,7 @@ services: reservations: memory: 2048M cpus: "0.2" -prometheusfederation: + prometheusfederation: hostname: "{% raw %}{{.Service.Name}}{% endraw %}" image: prom/prometheus:v2.47.2 volumes: @@ -160,6 +160,7 @@ prometheusfederation: - public extra_hosts: [] deploy: + replicas: 0 labels: - traefik.enable=true - traefik.docker.network=${PUBLIC_NETWORK} From 5dfa686668cf2d9e1fac3e8653864a6a2379e99f Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Thu, 16 Nov 2023 15:51:24 +0100 Subject: [PATCH 5/7] Rename and purge prometheus rules, add common prefixes s4l_ and osparc_ --- .../simcore/s4l-lite-admin-overview.json | 2 +- .../dashboards/simcore/admin-overview.json | 2 +- .../pgsql_query_exporter_config.yaml.j2 | 10 +-- .../prometheus/prometheus-federation.yml | 5 +- .../prometheus/prometheus.rules.yml | 62 ++++++++----------- 5 files changed, 35 insertions(+), 46 deletions(-) diff --git a/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json b/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json index 54c86e1f..665837c3 100644 --- a/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json +++ 
b/services/monitoring/grafana/provisioning/osparc.io/dashboards/simcore/s4l-lite-admin-overview.json @@ -87,7 +87,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "avg(production_members_in_gid_135745)", + "expr": "avg(osparc_production_members_in_gid_135745)", "legendFormat": "__auto", "range": true, "refId": "A" diff --git a/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json b/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json index 3a8cc228..a3455078 100644 --- a/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json +++ b/services/monitoring/grafana/provisioning/tip.itis.swiss/dashboards/simcore/admin-overview.json @@ -87,7 +87,7 @@ "uid": "RmZEr52nz" }, "editorMode": "code", - "expr": "avg(production_members_in_gid_1)", + "expr": "avg(osparc_production_members_in_gid_1)", "legendFormat": "__auto", "range": true, "refId": "A" diff --git a/services/monitoring/pgsql_query_exporter_config.yaml.j2 b/services/monitoring/pgsql_query_exporter_config.yaml.j2 index 09684c20..1e55ab9e 100644 --- a/services/monitoring/pgsql_query_exporter_config.yaml.j2 +++ b/services/monitoring/pgsql_query_exporter_config.yaml.j2 @@ -3,10 +3,10 @@ databases:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %} dsn: postgresql://{{POSTGRES_USER}}:{{MONITORED_POSTGRES_PASSWORDS.split(",")[loop.index0]}}@{{_stack + '_postgres' if 'rds.amazonaws.com' not in POSTGRES_HOST else POSTGRES_HOST}}:{{POSTGRES_PORT}}/{{POSTGRES_DB}}{% endfor %} metrics:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% for _gid in MONITORING_PROMETHEUS_PGSQL_GID_MONITORED.split(",") if _gid != "" %} - {{_stack}}_members_in_gid_{{_gid}}: + osparc_{{_stack}}_members_in_gid_{{_gid}}: type: gauge description: Number of users in group {{_gid}}{% endfor %} - {{_stack}}_total_number_of_users: + osparc_{{_stack}}_total_number_of_users: type: gauge
description: Total nuber of registered users{% endfor %} @@ -15,7 +15,7 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f query_{{_stack}}_members_in_gid_{{_gid}}: interval: 55 databases: [postgres_{{_stack}}] - metrics: [{{_stack}}_members_in_gid_{{_gid}}] + metrics: [osparc_{{_stack}}_members_in_gid_{{_gid}}] sql: | SELECT COUNT(*) as {{_stack}}_members_in_gid_{{_gid}} FROM users @@ -25,14 +25,14 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f query_{{_stack}}_total_number_of_users: interval: 55 databases: [postgres_{{_stack}}] - metrics: [{{_stack}}_total_number_of_users] + metrics: [osparc_{{_stack}}_total_number_of_users] sql: | SELECT COUNT(*) as {{_stack}}_total_number_of_users FROM users; query_{{_stack}}_total_number_of_users_excluding_guests: interval: 55 databases: [postgres_{{_stack}}] - metrics: [{{_stack}}_total_number_of_users] + metrics: [osparc_{{_stack}}_total_number_of_users] sql: | SELECT COUNT(*) as {{_stack}}_total_number_of_users FROM users WHERE role <> 'GUEST';{% endfor %} diff --git a/services/monitoring/prometheus/prometheus-federation.yml b/services/monitoring/prometheus/prometheus-federation.yml index c90e2302..d7ae2311 100644 --- a/services/monitoring/prometheus/prometheus-federation.yml +++ b/services/monitoring/prometheus/prometheus-federation.yml @@ -11,9 +11,8 @@ scrape_configs: params: 'match[]': # The match[] parameter is used to select the metrics to retrieve. Logical OR is used between match[] parameters. 
- - '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 'http_' - - '{__name__=~"up"}' # Regex to select the metric 'up' - - '{job="dy-sidecar-exporter"}' + - '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 's4l_' + - '{__name__=~"osparc_.*"}' # Regex to select all metrics that start with 'osparc_' static_configs: - targets: ['prometheuscatchall:9090'] # The targets section is where you specify the host and port of the Prometheus instances to scrape. diff --git a/services/monitoring/prometheus/prometheus.rules.yml b/services/monitoring/prometheus/prometheus.rules.yml index d77c95e3..a80a54af 100644 --- a/services/monitoring/prometheus/prometheus.rules.yml +++ b/services/monitoring/prometheus/prometheus.rules.yml @@ -1,49 +1,39 @@ groups: - - name: http_requests_total-rate-5min + - name: osparc_webserver_services_started_total-sum_by_key_tag rules: - - record: http_requests_total:rate5m - expr: rate(http_requests_total[5m]) - - - name: container_tasks_state-count_by_image - rules: - - record: container_tasks_state:count_by_image - expr: count by (image)(container_tasks_state{state="running", image=~".*/simcore/services/.*"}) - - - name: simcore_simcore_service_webserver_services_started_total-sum_by_key_tag - rules: - - record: simcore_simcore_service_webserver_services_started_total:sum_by_service_key_service_tag + - record: osparc_webserver_services_started_total_sum_by_service_key_service_tag expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_webserver_services_started_total) - - - name: simcore_simcore_service_director_services_started_total-sum_by_key_tag + - name: osparc_director_services_started_total-sum_by_key_tag rules: - - record: simcore_simcore_service_director_services_started_total:sum_by_service_key_service_tag + - record: osparc_director_services_started_total_sum_by_service_key_service_tag expr: sum by (service_key, service_tag, 
deployment)(simcore_simcore_service_director_services_started_total) - - name: node_cpu_seconds_total-nonidle-sum_over_nodes - rules: - - record: node_cpu_seconds_total:nonidle_sum_over_nodes - expr: sum(node_cpu_seconds_total{mode!="idle"}) - - name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks - interval: 1h - rules: - - record: node_cpu_seconds_total:nonidle_increase_over_nodes_12weeks - expr: sum(increase(node_cpu_seconds_total{mode!="idle"}[12w])) - - name: cpu_usage_per_simcore_service - interval: 120s - rules: - - record: osparc_metrics:cpu_usage_per_simcore_service - expr: sum by (service_name, instance, node_name) (label_replace(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=~".*simcore.*"}[1m]), "service_name", "$1", "container_label_com_docker_swarm_service_name", ".*_(.*)")) * 100 - - name: cpu_usage_per_node + - name: osparc_cpu_usage_per_node interval: 60s rules: - - record: osparc_metrics:cpu_usage_per_node_percentage + - record: osparc_cpu_usage_per_node_percentage expr: 100 - (avg(irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[50s])) by (instance,node_name) * 100) - - name: cpu_seconds_per_node + - name: osparc_cpu_seconds_per_node interval: 60s rules: - - record: osparc_metrics:cpu_seconds_per_node + - record: osparc_osparc_cpu_seconds_per_node expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) - - name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks-v2 + - name: osparc_node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks interval: 180s rules: - - record: node_cpu_seconds_total_v2:nonidle_increase_over_nodes_12weeks_v2 - expr: sum(increase(osparc_metrics:cpu_seconds_per_node[12w])) + - record: osparc_node_cpu_seconds_total_nonidle_increase_over_nodes_12weeks + expr: sum(increase(osparc_cpu_seconds_per_node[12w])) + - name: osparc_container_instances_s4lcorelite + interval: 60s + rules: + - record: 
osparc_container_instances_s4lcorelite + expr: count(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}) OR clamp_max(absent(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}),0) + - name: osparc_autoscaling_machines_active + interval: 60s + rules: + - record: osparc_autoscaling_machines_active + expr: sum(count_values("instance",node_exporter_build_info{instance=~"^ip-.*$"})) + - name: osparc_autoscaling_machines_buffer + interval: 60s + rules: + - record: osparc_autoscaling_machines_buffer + expr: sum(swarm_node_info{instance=~"^ip-.*$"}) - ( sum(count_values("instance",node_boot_time_seconds{instance=~"^ip-.*$"})) OR clamp_max(absent(node_boot_time_seconds{instance=~"^ip-.*$"}),0) ) From f5e67ebe5a0ac6bff3d654e304609f2b9d0129bb Mon Sep 17 00:00:00 2001 From: kaiser Date: Fri, 17 Nov 2023 11:42:39 +0100 Subject: [PATCH 6/7] fix typo --- services/monitoring/prometheus/prometheus.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/prometheus/prometheus.rules.yml b/services/monitoring/prometheus/prometheus.rules.yml index a80a54af..e4c90be8 100644 --- a/services/monitoring/prometheus/prometheus.rules.yml +++ b/services/monitoring/prometheus/prometheus.rules.yml @@ -15,7 +15,7 @@ groups: - name: osparc_cpu_seconds_per_node interval: 60s rules: - - record: osparc_osparc_cpu_seconds_per_node + - record: osparc_cpu_seconds_per_node expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) - name: osparc_node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks interval: 180s From d461c002e03fe137d8bf456abb0064398326e4e5 Mon Sep 17 00:00:00 2001 From: kaiser Date: Fri, 17 Nov 2023 11:45:22 +0100 Subject: [PATCH 7/7] fix typo --- services/monitoring/prometheus/prometheus.rules.yml | 8 ++++---- 1 file changed, 4
insertions(+), 4 deletions(-) diff --git a/services/monitoring/prometheus/prometheus.rules.yml b/services/monitoring/prometheus/prometheus.rules.yml index e4c90be8..25e0b18a 100644 --- a/services/monitoring/prometheus/prometheus.rules.yml +++ b/services/monitoring/prometheus/prometheus.rules.yml @@ -1,11 +1,11 @@ groups: - - name: osparc_webserver_services_started_total-sum_by_key_tag + - name: osparc_webserver_services_started_total_sum_by_key_tag rules: - - record: osparc_webserver_services_started_total_sum_by_service_key_service_tag + - record: osparc_webserver_services_started_total_sum_by_key_tag expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_webserver_services_started_total) - - name: osparc_director_services_started_total-sum_by_key_tag + - name: osparc_director_services_started_total_sum_by_key_tag rules: - - record: osparc_director_services_started_total_sum_by_service_key_service_tag + - record: osparc_director_services_started_total_sum_by_key_tag expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_director_services_started_total) - name: osparc_cpu_usage_per_node interval: 60s