diff --git a/services/MachineLearningServices/workspaces/alerts.yaml b/services/MachineLearningServices/workspaces/alerts.yaml
index f6e53b96d..d8a39f869 100644
--- a/services/MachineLearningServices/workspaces/alerts.yaml
+++ b/services/MachineLearningServices/workspaces/alerts.yaml
@@ -18,3 +18,66 @@
     criterionType: StaticThresholdCriterion
     threshold: 0.0
   guid: c897902c-40a5-497b-a0ce-86c3eda7c61d
+- name: Model Deploy Failed
+  description: Number of model deployments that failed in this workspace.
+  type: Metric
+  verified: true
+  visible: true
+  tags:
+  - manual
+  properties:
+    metricName: Model Deploy Failed
+    metricNamespace: Microsoft.MachineLearningServices/workspaces
+    severity: 3
+    windowSize: PT5M  # ISO 8601 duration: 5-minute window
+    evaluationFrequency: PT1M  # ISO 8601 duration: checked every minute
+    timeAggregation: Total
+    operator: GreaterThan
+    criterionType: StaticThresholdCriterion
+    threshold: 0.0  # with GreaterThan on Total: any failed deployment in the window
+  references:
+  - name: Monitor Azure Machine Learning
+    url: https://learn.microsoft.com/en-us/azure/machine-learning/monitor-azure-machine-learning?view=azureml-api-2#machine-learning-alert-rules
+  guid: 0337a76f-238e-4d4d-9cd1-48b205874dbb
+- name: Quota Utilization Percentage
+  description: Percent of quota utilized.
+  type: Metric
+  verified: true
+  visible: true
+  tags:
+  - manual
+  properties:
+    metricName: Quota Utilization Percentage
+    metricNamespace: Microsoft.MachineLearningServices/workspaces
+    severity: 3
+    windowSize: PT5M  # ISO 8601 duration: 5-minute window
+    evaluationFrequency: PT1M  # ISO 8601 duration: checked every minute
+    timeAggregation: Average
+    operator: GreaterThan
+    criterionType: StaticThresholdCriterion
+    threshold: 90.0  # with GreaterThan on Average: above 90 percent of quota
+  references:
+  - name: Monitor Azure Machine Learning
+    url: https://learn.microsoft.com/en-us/azure/machine-learning/monitor-azure-machine-learning?view=azureml-api-2#machine-learning-alert-rules
+  guid: be3f1bfc-c21a-4399-9b9f-a33ebdc470cb
+- name: Unusable Nodes
+  description: Number of unusable nodes. Unusable nodes are not functional due to some unresolvable issue. Azure will recycle these nodes.
+  type: Metric
+  verified: true
+  visible: true
+  tags:
+  - manual
+  properties:
+    metricName: Unusable Nodes
+    metricNamespace: Microsoft.MachineLearningServices/workspaces
+    severity: 3
+    windowSize: PT5M  # ISO 8601 duration: 5-minute window
+    evaluationFrequency: PT1M  # ISO 8601 duration: checked every minute
+    timeAggregation: Total
+    operator: GreaterThan
+    criterionType: StaticThresholdCriterion
+    threshold: 0.0  # with GreaterThan on Total: any unusable node in the window
+  references:
+  - name: Monitor Azure Machine Learning
+    url: https://learn.microsoft.com/en-us/azure/machine-learning/monitor-azure-machine-learning?view=azureml-api-2#machine-learning-alert-rules
+  guid: a171bc0c-676f-464b-a7b5-e50cd6c612a2