Skip to content

Commit

Permalink
Rename and purge prometheus rules, add common prefixes s4l_ and osparc_
Browse files Browse the repository at this point in the history
  • Loading branch information
mrnicegyu11 committed Nov 16, 2023
1 parent 085efab commit 5dfa686
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "avg(production_members_in_gid_135745)",
"expr": "avg(osparc_production_members_in_gid_135745)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "avg(production_members_in_gid_1)",
"expr": "avg(osparc_production_members_in_gid_1)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
10 changes: 5 additions & 5 deletions services/monitoring/pgsql_query_exporter_config.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ databases:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}
dsn: postgresql://{{POSTGRES_USER}}:{{MONITORED_POSTGRES_PASSWORDS.split(",")[loop.index0]}}@{{_stack + '_postgres' if 'rds.amazonaws.com' not in POSTGRES_HOST else POSTGRES_HOST}}:{{POSTGRES_PORT}}/{{POSTGRES_DB}}{% endfor %}

metrics:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% for _gid in MONITORING_PROMETHEUS_PGSQL_GID_MONITORED.split(",") if _gid != "" %}
{{_stack}}_members_in_gid_{{_gid}}:
# One gauge per (stack, monitored group id). The key must render to the same
# name that the matching query lists under `metrics:`, so the stack has to be
# a proper Jinja expression (double braces on both sides).
osparc_{{_stack}}_members_in_gid_{{_gid}}:
type: gauge
description: Number of users in group {{_gid}}{% endfor %}
{{_stack}}_total_number_of_users:
# One gauge per stack; referenced by the total_number_of_users queries.
osparc_{{_stack}}_total_number_of_users:
type: gauge
description: Total number of registered users{% endfor %}

Expand All @@ -15,7 +15,7 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f
query_{{_stack}}_members_in_gid_{{_gid}}:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_members_in_gid_{{_gid}}]
# query-exporter maps result columns to metrics by name, so the SELECT alias
# must be identical to the metric name listed here (including the osparc_ prefix).
metrics: [osparc_{{_stack}}_members_in_gid_{{_gid}}]
sql: |
SELECT COUNT(*) as osparc_{{_stack}}_members_in_gid_{{_gid}}
FROM users
Expand All @@ -25,14 +25,14 @@ queries:{% for _stack in MONITORED_STACK_NAMES.split(",") if _stack != "" %}{% f
query_{{_stack}}_total_number_of_users:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_total_number_of_users]
# query-exporter maps result columns to metrics by name, so the SELECT alias
# must be identical to the metric name listed here (including the osparc_ prefix).
metrics: [osparc_{{_stack}}_total_number_of_users]
sql: |
SELECT COUNT(*) as osparc_{{_stack}}_total_number_of_users
FROM users;
query_{{_stack}}_total_number_of_users_excluding_guests:
interval: 55
databases: [postgres_{{_stack}}]
metrics: [{{_stack}}_total_number_of_users]
# query-exporter maps result columns to metrics by name, so the SELECT alias
# must be identical to the metric name listed here (including the osparc_ prefix).
# NOTE(review): this query updates the same gauge as the unfiltered
# total_number_of_users query, with a different count - confirm whether a
# dedicated "excluding guests" metric is intended.
metrics: [osparc_{{_stack}}_total_number_of_users]
sql: |
SELECT COUNT(*) as osparc_{{_stack}}_total_number_of_users
FROM users WHERE role <> 'GUEST';{% endfor %}
5 changes: 2 additions & 3 deletions services/monitoring/prometheus/prometheus-federation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@ scrape_configs:

params:
'match[]': # The match[] parameter is used to select the metrics to retrieve. Logical OR is used between match[] parameters.
- '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 'http_'
- '{__name__=~"up"}' # Regex to select the metric 'up'
- '{job="dy-sidecar-exporter"}'
- '{__name__=~"s4l_.*"}' # Regex to select all metrics that start with 's4l_'
- '{__name__=~"osparc_.*"}' # Regex to select all metrics that start with 'osparc_'

static_configs:
- targets: ['prometheuscatchall:9090'] # The targets section is where you specify the host and port of the Prometheus instances to scrape.
62 changes: 26 additions & 36 deletions services/monitoring/prometheus/prometheus.rules.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,39 @@
groups:
- name: http_requests_total-rate-5min
- name: osparc_webserver_services_started_total-sum_by_key_tag
rules:
- record: http_requests_total:rate5m
expr: rate(http_requests_total[5m])

- name: container_tasks_state-count_by_image
rules:
- record: container_tasks_state:count_by_image
expr: count by (image)(container_tasks_state{state="running", image=~".*/simcore/services/.*"})

- name: simcore_simcore_service_webserver_services_started_total-sum_by_key_tag
rules:
- record: simcore_simcore_service_webserver_services_started_total:sum_by_service_key_service_tag
- record: osparc_webserver_services_started_total_sum_by_service_key_service_tag
expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_webserver_services_started_total)

- name: simcore_simcore_service_director_services_started_total-sum_by_key_tag
- name: osparc_director_services_started_total-sum_by_key_tag
rules:
- record: simcore_simcore_service_director_services_started_total:sum_by_service_key_service_tag
- record: osparc_director_services_started_total_sum_by_service_key_service_tag
expr: sum by (service_key, service_tag, deployment)(simcore_simcore_service_director_services_started_total)
- name: node_cpu_seconds_total-nonidle-sum_over_nodes
rules:
- record: node_cpu_seconds_total:nonidle_sum_over_nodes
expr: sum(node_cpu_seconds_total{mode!="idle"})
- name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks
interval: 1h
rules:
- record: node_cpu_seconds_total:nonidle_increase_over_nodes_12weeks
expr: sum(increase(node_cpu_seconds_total{mode!="idle"}[12w]))
- name: cpu_usage_per_simcore_service
interval: 120s
rules:
- record: osparc_metrics:cpu_usage_per_simcore_service
expr: sum by (service_name, instance, node_name) (label_replace(irate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=~".*simcore.*"}[1m]), "service_name", "$1", "container_label_com_docker_swarm_service_name", ".*_(.*)")) * 100
- name: cpu_usage_per_node
- name: osparc_cpu_usage_per_node
interval: 60s
rules:
- record: osparc_metrics:cpu_usage_per_node_percentage
- record: osparc_cpu_usage_per_node_percentage
expr: 100 - (avg(irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[50s])) by (instance,node_name) * 100)
- name: cpu_seconds_per_node
- name: osparc_cpu_seconds_per_node
interval: 60s
rules:
- record: osparc_metrics:cpu_seconds_per_node
# Non-idle CPU seconds summed per instance. The 12-week increase rule reads
# this series as `osparc_cpu_seconds_per_node`, so it must be recorded under
# exactly that name (single osparc_ prefix).
- record: osparc_cpu_seconds_per_node
expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"})
- name: node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks-v2
- name: osparc_node_cpu_seconds_total-nonidle-increase-over-nodes-12weeks
interval: 180s
rules:
- record: node_cpu_seconds_total_v2:nonidle_increase_over_nodes_12weeks_v2
expr: sum(increase(osparc_metrics:cpu_seconds_per_node[12w]))
- record: osparc_node_cpu_seconds_total_nonidle_increase_over_nodes_12weeks
expr: sum(increase(osparc_cpu_seconds_per_node[12w]))
# Counts container_memory_usage_bytes series for s4l-core-lite images whose
# simcore user agent label is not "puppeteer". The `OR clamp_max(absent(...), 0)`
# arm yields 0 instead of no data when no such series exists.
# The record name matches the group name so it does not collide with the
# CPU-usage rule that records osparc_cpu_usage_per_node_percentage.
# NOTE(review): `[.osparc.io]` in the image regex is a character class (it
# matches a single character), not the literal ".osparc.io" - confirm intent.
- name: osparc_container_instances_s4lcorelite
interval: 60s
rules:
- record: osparc_container_instances_s4lcorelite
expr: count(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}) OR clamp_max(absent(container_memory_usage_bytes{image=~"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$",container_label_simcore_user_agent!="puppeteer"}),0)
# Evaluated every 60s: number of node_exporter_build_info series whose
# `instance` label starts with "ip-".
# NOTE(review): presumably one such series exists per autoscaled machine that
# is up and scraped - confirm against the node-exporter scrape config.
- name: osparc_autoscaling_machines_active
interval: 60s
rules:
- record: osparc_autoscaling_machines_active
expr: sum(count_values("instance",node_exporter_build_info{instance=~"^ip-.*$"}))
# Evaluated every 60s: sum of swarm_node_info over "ip-*" instances minus the
# number of node_boot_time_seconds series for "ip-*" instances. The
# `OR clamp_max(absent(...), 0)` arm substitutes 0 for the subtrahend when no
# node_boot_time_seconds series matches.
# NOTE(review): presumably this is the count of machines known to swarm that
# are not reporting node-exporter metrics (buffer machines) - confirm.
- name: osparc_autoscaling_machines_buffer
interval: 60s
rules:
- record: osparc_autoscaling_machines_buffer
expr: sum(swarm_node_info{instance=~"^ip-.*$"}) - ( sum(count_values("instance",node_boot_time_seconds{instance=~"^ip-.*$"})) OR clamp_max(absent(node_boot_time_seconds{instance=~"^ip-.*$"}),0) )

0 comments on commit 5dfa686

Please sign in to comment.