Add integration for Anyscale (#18643)

* Add integration for Anyscale * fix manifest * add labeler config * more manifest fixes * Apply suggestions from code review Co-authored-by: Austin Lai <[email protected]> * Apply suggestions from code review * Update anyscale/manifest.json * Update anyscale/manifest.json --------- Co-authored-by: dkirov-dd <[email protected]> Co-authored-by: Austin Lai <[email protected]>
DataDog · Sep 27, 2024 · 6aa3c4d · 6aa3c4d
1 parent 965ba2d
commit 6aa3c4d
Show file tree

Hide file tree

Showing 10 changed files with 1,841 additions and 0 deletions.
diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml
@@ -55,6 +55,8 @@ integration/amazon_msk:
 - amazon_msk/**/*
 integration/ambari:
 - ambari/**/*
+integration/anyscale:
+- anyscale/**/*
 integration/apache:
 - apache/**/*
 integration/appgate_sdp:

diff --git a/anyscale/CHANGELOG.md b/anyscale/CHANGELOG.md
@@ -0,0 +1,7 @@
+# CHANGELOG - Anyscale
+
+## 1.0.0 / 2024-09-21
+
+***Added***:
+
+* Initial Release
diff --git a/anyscale/README.md b/anyscale/README.md
@@ -0,0 +1,24 @@
+# Agent Check: Anyscale
+
+## Overview
+
+This check monitors [Anyscale][1]. Anyscale is a compute platform that hosts [Ray][5] clusters.
+
+## Setup
+
+To install and configure the Datadog Agent with the Ray integration on Anyscale, see the [official Anyscale documentation][6].
+
+## Data Collected
+
+See the [Ray integration's Data Collected section][4].
+
+## Troubleshooting
+
+Need help? Contact [Datadog support][3].
+
+[1]: https://docs.anyscale.com/
+[2]: https://app.datadoghq.com/account/settings/agent/latest
+[3]: https://docs.datadoghq.com/help/
+[4]: https://docs.datadoghq.com/integrations/ray/?tab=host#data-collected
+[5]: https://docs.ray.io/en/latest/
+[6]: https://docs.anyscale.com/monitoring/datadog/
diff --git a/anyscale/assets/dashboards/anyscale_overview.json b/anyscale/assets/dashboards/anyscale_overview.json
diff --git a/anyscale/assets/monitors/cpu_utilization.json b/anyscale/assets/monitors/cpu_utilization.json
@@ -0,0 +1,31 @@
+{
+    "version": 2,
+    "created_at": "2023-12-13",
+    "last_updated_at": "2023-12-13",
+    "title": "High CPU Utilization on Anyscale node",
+    "description": "Running machine learning or data processing workloads with Anyscale can be computationally intensive. This monitor notifies you when the CPU utilization on an Anyscale node is high.",
+    "definition": {
+        "id": 136348522,
+        "name": "CPU Utilization maxing out on Anyscale node",
+        "type": "query alert",
+        "query": "avg(last_5m):max:ray.node.cpu_utilization{*} by {host} > 98",
+        "message": "{{#is_alert}}\nCPU Utilization is at {{value}}%. Please check resource provisioning. This occurred on node: {{host.name}}. \n{{/is_alert}}\n\n##Please note that you can use a similar query for GPU Utilization\n##replace the above metric with ray.node.gram_used or ray.node.gpus_utilization",
+        "tags": [],
+        "options": {
+            "thresholds": {
+                "critical": 98
+            },
+            "notify_audit": false,
+            "include_tags": true,
+            "new_group_delay": 60,
+            "notify_no_data": false,
+            "avalanche_window": 10,
+            "silenced": {}
+        },
+        "priority": null,
+        "restricted_roles": null
+    },
+    "tags": [
+        "integration:anyscale"
+    ]
+}
diff --git a/anyscale/assets/monitors/failed_task.json b/anyscale/assets/monitors/failed_task.json
@@ -0,0 +1,33 @@
+{
+    "version": 2,
+    "created_at": "2023-12-13",
+    "last_updated_at": "2023-12-13",
+    "title": "High Number of Failed Tasks on Anyscale Node",
+    "description": "Tasks are scheduled to workers by the Anyscale scheduler. This monitor alerts when there are too many scheduling failures within a specified time period due to a particular reason.",
+    "definition": {
+        "id": 136348417,
+        "name": "High Number of Failed Tasks on Anyscale Node",
+        "type": "query alert",
+        "query": "avg(last_5m):avg:ray.scheduler.failed_worker_startup{*} by {reason,nodeaddress} > 25",
+        "message": "{{#is_alert}}\nThere are {{value}} tasks that failed to be scheduled because workers were not available. This occurred on node: {{nodeaddress.name}}. The stated reason was {{reason.name}}.\n{{/is_alert}}",
+        "tags": [],
+        "options": {
+            "thresholds": {
+                "critical": 25
+            },
+            "notify_audit": false,
+            "include_tags": true,
+            "new_group_delay": 60,
+            "renotify_interval": 0,
+            "escalation_message": "",
+            "notify_no_data": false,
+            "avalanche_window": 10,
+            "silenced": {}
+        },
+        "priority": null,
+        "restricted_roles": null
+    },
+    "tags": [
+        "integration:anyscale"
+    ]
+}
diff --git a/anyscale/assets/monitors/gpu_utilization.json b/anyscale/assets/monitors/gpu_utilization.json
@@ -0,0 +1,31 @@
+{
+    "version": 2,
+    "created_at": "2023-12-13",
+    "last_updated_at": "2023-12-13",
+    "title": "GPU Utilization low on Anyscale Node",
+    "description": "Ray can leverage the computing power of GPUs to perform machine learning or data processing tasks. Underutilizing available GPU can be expensive. This monitor alerts when the GPU utilization is low on an Anyscale node, indicating overprovisioning.",
+    "definition": {
+        "id": 136350024,
+        "name": "GPU Utilization low on Anyscale Node",
+        "type": "query alert",
+        "query": "avg(last_5m):max:ray.node.gpus_utilization{*} by {host} < 30",
+        "message": "{{#is_alert}}\nGPU Utilization is under {{value}}%. Your system may be overprovisioned. This occurred on node: {{host.name}}. \n{{/is_alert}}",
+        "tags": [],
+        "options": {
+            "thresholds": {
+                "critical": 30
+            },
+            "notify_audit": false,
+            "include_tags": true,
+            "new_group_delay": 60,
+            "notify_no_data": false,
+            "avalanche_window": 10,
+            "silenced": {}
+        },
+        "priority": null,
+        "restricted_roles": null
+    },
+    "tags": [
+        "integration:anyscale"
+    ]
+}
diff --git a/anyscale/assets/monitors/mem_utilization.json b/anyscale/assets/monitors/mem_utilization.json
@@ -0,0 +1,32 @@
+{
+    "version": 2,
+    "created_at": "2023-12-13",
+    "last_updated_at": "2023-12-13",
+    "title": "High Memory Usage",
+    "description": "Running Anyscale machine learning or data processing workloads can be computationally intensive. This monitor notifies you when the Anyscale node is running low on available memory.",
+    "definition": {
+        "id": 136348497,
+        "name": "High Memory Usage on Anyscale Node",
+        "type": "query alert",
+        "query": "avg(last_5m):100 * avg:ray.node.mem.available{*} / avg:ray.node.mem.total{*} < 5",
+        "message": "{{#is_alert}} \nThere is less than {{value}}% memory available.\n\n{{/is_alert}}\n\n{{#is_warning}}\nThere is less than {{value}}% memory available.\n\n{{/is_warning}}",
+        "tags": [],
+        "options": {
+            "thresholds": {
+                "critical": 5,
+                "warning": 10
+            },
+            "notify_audit": false,
+            "include_tags": false,
+            "notify_no_data": false,
+            "avalanche_window": 10,
+            "new_host_delay": 300,
+            "silenced": {}
+        },
+        "priority": null,
+        "restricted_roles": null
+    },
+    "tags": [
+        "integration:anyscale"
+    ]
+}
diff --git a/anyscale/assets/service_checks.json b/anyscale/assets/service_checks.json
@@ -0,0 +1 @@
+[]
diff --git a/anyscale/manifest.json b/anyscale/manifest.json
@@ -0,0 +1,48 @@
+{
+  "manifest_version": "2.0.0",
+  "app_uuid": "e3d14556-99b3-4c17-a51a-4d66ef622601",
+  "app_id": "anyscale",
+  "display_on_public_website": false,
+  "tile": {
+    "overview": "README.md#Overview",
+    "configuration": "README.md#Setup",
+    "support": "README.md#Support",
+    "changelog": "CHANGELOG.md",
+    "description": "Monitor the health and performance of Anyscale",
+    "title": "Anyscale",
+    "media": [],
+    "classifier_tags": [
+      "Supported OS::Linux",
+      "Supported OS::Windows",
+      "Supported OS::macOS",
+      "Category::AI/ML",
+      "Offering::Integration",
+      "Queried Data Type::Metrics"
+    ]
+  },
+  "assets": {
+    "integration": {
+      "auto_install": true,
+      "source_type_id": 27164037,
+      "source_type_name": "Anyscale",
+      "service_checks": {
+        "metadata_path": "assets/service_checks.json"
+      }
+    },
+    "dashboards": {
+      "Anyscale Overview": "assets/dashboards/anyscale_overview.json"
+    },
+    "monitors": {
+      "High CPU Utilization on Anyscale node": "assets/monitors/cpu_utilization.json",
+      "GPU Utilization low on Anyscale Node": "assets/monitors/gpu_utilization.json",
+      "High Memory Usage": "assets/monitors/mem_utilization.json",
+      "High Number of Failed Tasks on Anyscale Node": "assets/monitors/failed_task.json"
+    }
+  },
+  "author": {
+    "support_email": "[email protected]",
+    "name": "Datadog",
+    "homepage": "https://www.datadoghq.com",
+    "sales_email": "[email protected]"
+  }
+}