diff --git a/docs/content/patterns/artificial intelligence/Use Cases/index.md b/docs/content/patterns/artificial intelligence/Use Cases/index.md new file mode 100644 index 000000000..d831963cd --- /dev/null +++ b/docs/content/patterns/artificial intelligence/Use Cases/index.md @@ -0,0 +1,7 @@ +# Overview + +There are numerous ways to implement AI solutions on Azure, and each comes with its own monitoring solution. Monitoring AI solutions involves a combination of the infrastructure or PaaS resources, along with monitoring any utilization metrics that can be exposed through the platform or other tooling. This page will summarize the recommended monitoring solutions for different scenarios. + +## AI on Infrastructure (BYOM) + +Running AI workloads on Azure infrastructure involves monitoring each of the components of the solution, including virtual machines, storage, and networking. Refer to the defined metrics in [HPC](../../specialized/hpc/Alerting-and-Monitoring.md). For monitoring the GPU/CPU metrics, use [Moneo](https://github.com/Azure/Moneo). diff --git a/docs/content/patterns/artificial intelligence/index.md b/docs/content/patterns/artificial intelligence/index.md new file mode 100644 index 000000000..dcab696ae --- /dev/null +++ b/docs/content/patterns/artificial intelligence/index.md @@ -0,0 +1,4 @@ +--- +title: Artificial Intelligence +geekdocCollapseSection: true +--- diff --git a/docs/content/patterns/specialized/hpc/_index.md b/docs/content/patterns/specialized/hpc/_index.md index 2ce6660d2..786f60911 100644 --- a/docs/content/patterns/specialized/hpc/_index.md +++ b/docs/content/patterns/specialized/hpc/_index.md @@ -11,14 +11,10 @@ High Performance Compute supports a variety of workloads. Seismic modeling, flui * Azure Batch Service * Azure NetApp Files * Azure Blob Storage -* Azure Managed Lustre Filesystem - Coming Soon! +* Azure Managed Lustre Filesystem Please note that an HPC Landing Zone is built on top of the best practices of the Azure Landing Zone. 
The approach for broader monitoring and alerting in the context of the Azure Landing Zone can be found [here](https://azure.github.io/azure-monitor-baseline-alerts/patterns/alz/Monitoring-and-Alerting/). -## Azure High Performance Computing on Demand +## Azure CycleCloud Workspace for Slurm -[Azure High Performance Computing on Demand (Az-HOP)](https://learn.microsoft.com/azure/cloud-adoption-framework/scenarios/azure-hpc/azure-hpc-landing-zone-accelerator) is our HPC Landing Zone accelerator. It provides Grafana Dashboards to monitor you cluster. It uses [Azure CycleCloud](https://learn.microsoft.com/azure/cyclecloud/overview?view=cyclecloud-8) as a scheduler. - -## GPU Monitoring - -We are working on explicit GPU metrics to monitor to for HPC/AI workloads. Until then, Azure HPC Ubuntu VMs come with [Moneo](https://github.com/Azure/Moneo) +[Azure CycleCloud Workspace for Slurm](https://learn.microsoft.com/azure/cyclecloud/overview-ccws?view=cyclecloud-8) is our HPC Landing Zone accelerator. 
diff --git a/services/NetApp/netAppAccounts/alerts.yaml b/services/NetApp/netAppAccounts/alerts.yaml index ca5575baa..c2e26f20e 100644 --- a/services/NetApp/netAppAccounts/alerts.yaml +++ b/services/NetApp/netAppAccounts/alerts.yaml @@ -6,6 +6,7 @@ tags: - auto-generated - agc-19726 + - hpc properties: metricName: VolumeConsumedSizePercentage metricNamespace: Microsoft.NetApp/netAppAccounts/capacityPools/volumes @@ -25,6 +26,7 @@ tags: - auto-generated - agc-1914 + - hpc properties: metricName: VolumeLogicalSize metricNamespace: Microsoft.NetApp/netAppAccounts/capacityPools/volumes @@ -87,6 +89,7 @@ tags: - auto-generated - agc-374 + - hpc properties: metricName: AverageReadLatency metricNamespace: Microsoft.NetApp/netAppAccounts/capacityPools/volumes @@ -107,6 +110,7 @@ tags: - auto-generated - agc-305 + - hpc properties: metricName: CbsVolumeOperationComplete metricNamespace: Microsoft.NetApp/netAppAccounts/capacityPools/volumes @@ -126,6 +130,7 @@ tags: - auto-generated - agc-301 + - hpc properties: metricName: VolumeAllocatedSize metricNamespace: Microsoft.NetApp/netAppAccounts/capacityPools/volumes diff --git a/services/StorageCache/AmlFilesystems/_index.md b/services/StorageCache/AmlFilesystems/_index.md index 85e0cdb67..0b2960abc 100644 --- a/services/StorageCache/AmlFilesystems/_index.md +++ b/services/StorageCache/AmlFilesystems/_index.md @@ -1,5 +1,7 @@ --- title: amlFilesystems geekdocCollapseSection: true -geekdocHidden: true +geekdocHidden: false --- + +{{< alertList name="alertList" >}} diff --git a/services/StorageCache/AmlFilesystems/alerts.yaml b/services/StorageCache/AmlFilesystems/alerts.yaml index e8557d091..aa705941e 100644 --- a/services/StorageCache/AmlFilesystems/alerts.yaml +++ b/services/StorageCache/AmlFilesystems/alerts.yaml @@ -1,38 +1,478 @@ -- name: TotalClientIOPS - description: Total number of client input/output operations per second - type: Metric - verified: false - visible: false +- name: OST Files Used + description: Log an 
alert if OSTFilesUsed is above 85% + type: Log + verified: true + visible: true tags: - hpc properties: - metricName: ClientIOPS - metricNamespace: Microsoft.StorageCache/amlFilesystems - severity: 3 - windowSize: PT5M - evaluationFrequency: PT1M - timeAggregation: Total + severity: 2 operator: GreaterThan - criterionType: StaticThresholdCriterion - threshold: 10000 - autoMitigate: false - guid: ff3df303-c3b0-449c-840d-68084707773e -- name: AverageClientLatency - description: Average latency for client operations + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 85 + metricMeasureColumn: AggregatedValue + dimensions: + - name: UsedRatio + operator: Include + values: + - '*' + - name: OSTFilesUsed + operator: Include + values: + - '*' + - name: OSTFilesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_used = 0.85; + + AzureMetrics + + | where MetricName == "OSTFilesTotal" or MetricName == "OSTFilesUsed" + + | summarize + + OSTFilesTotal = maxif(Total, MetricName == "OSTFilesTotal"), + + OSTFilesUsed = maxif(Total, MetricName == "OSTFilesUsed") + + | extend UsedRatio = OSTFilesUsed / OSTFilesTotal + + | where UsedRatio > threshold_used + + | project UsedRatio, OSTFilesUsed, OSTFilesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: 9d086772-1887-4893-8b9f-7e5169398bae + references: +- name: OST Files Free + description: Log an alert if OSTFilesFree is below 15% + type: Log + verified: true + visible: true + tags: + - hpc + properties: + severity: 2 + operator: LessThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 15 + metricMeasureColumn: AggregatedValue + dimensions: + - name: FreeRatio + operator: Include + values: + - '*' + - name: OSTFilesFree + operator: Include + values: + - '*' + - name: OSTFilesTotal + operator: Include + values: + - '*' + 
failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_free = 0.15; + + AzureMetrics + + | where MetricName == "OSTFilesFree" or MetricName == "OSTFilesTotal" + + | summarize + + OSTFilesFree = maxif(Total, MetricName == "OSTFilesFree"), + + OSTFilesTotal = maxif(Total, MetricName == "OSTFilesTotal") + + | extend FreeRatio = OSTFilesFree / OSTFilesTotal + + | where FreeRatio < threshold_free + + | project FreeRatio, OSTFilesFree, OSTFilesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: 8f231351-c123-4e4c-8631-9978e641a3ca + references: +- name: OST Bytes Available + description: Log an alert if OSTBytesAvailable is below 15% + type: Log + verified: true + visible: true + tags: + - hpc + properties: + severity: 2 + operator: LessThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 15 + metricMeasureColumn: AggregatedValue + dimensions: + - name: AvailableRatio + operator: Include + values: + - '*' + - name: OSTBytesAvailable + operator: Include + values: + - '*' + - name: OSTBytesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_available = 0.15; + + AzureMetrics + + | where MetricName == "OSTBytesAvailable" or MetricName == "OSTBytesTotal" + + | summarize + + OSTBytesAvailable = maxif(Total, MetricName == "OSTBytesAvailable"), + + OSTBytesTotal = maxif(Total, MetricName == "OSTBytesTotal") + + | extend AvailableRatio = OSTBytesAvailable / OSTBytesTotal + + | where AvailableRatio < threshold_available + + | project AvailableRatio, OSTBytesAvailable, OSTBytesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: 4eeca790-a804-4453-b339-73ea425610bc + references: +- name: OST Bytes Used + description: Log an alert if OSTBytesUsed is above 85% + type: Log + verified: true + visible: true + tags: + - hpc + 
properties: + severity: 2 + operator: GreaterThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 85 + metricMeasureColumn: AggregatedValue + dimensions: + - name: UsedRatio + operator: Include + values: + - '*' + - name: OSTBytesUsed + operator: Include + values: + - '*' + - name: OSTBytesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_used = 0.85; + + AzureMetrics + + | where MetricName == "OSTBytesTotal" or MetricName == "OSTBytesUsed" + + | summarize + + OSTBytesTotal = maxif(Total, MetricName == "OSTBytesTotal"), + + OSTBytesUsed = maxif(Total, MetricName == "OSTBytesUsed") + + | extend UsedRatio = OSTBytesUsed / OSTBytesTotal + + | where UsedRatio > threshold_used + + | project UsedRatio, OSTBytesUsed, OSTBytesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: 59298086-ec77-4f47-b2ef-b853b79e31cb + references: +- name: MDT Files Free + description: Log an alert if MDTFilesFree is below 15% + type: Log + verified: true + visible: true + tags: + - hpc + properties: + severity: 2 + operator: LessThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 15 + metricMeasureColumn: AggregatedValue + dimensions: + - name: FreeRatio + operator: Include + values: + - '*' + - name: MDTFilesFree + operator: Include + values: + - '*' + - name: MDTFilesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_free = 0.15; + + AzureMetrics + + | where MetricName == "MDTFilesFree" or MetricName == "MDTFilesTotal" + + | summarize + + MDTFilesFree = maxif(Total, MetricName == "MDTFilesFree"), + + MDTFilesTotal = maxif(Total, MetricName == "MDTFilesTotal") + + | extend FreeRatio = MDTFilesFree / MDTFilesTotal + + | where FreeRatio < threshold_free + + | project FreeRatio, 
MDTFilesFree, MDTFilesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: 2feba8fd-ff1e-4f48-bc01-6e2996edafa6 + references: +- name: MDT Files Used + description: Log an alert if MDTFilesUsed is above 85% + type: Log + verified: true + visible: true + tags: + - hpc + properties: + severity: 2 + operator: GreaterThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 85 + metricMeasureColumn: AggregatedValue + dimensions: + - name: UsedRatio + operator: Include + values: + - '*' + - name: MDTFilesUsed + operator: Include + values: + - '*' + - name: MDTFilesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_used = 0.85; + + AzureMetrics + + | where MetricName == "MDTFilesTotal" or MetricName == "MDTFilesUsed" + + | summarize + + MDTFilesTotal = maxif(Total, MetricName == "MDTFilesTotal"), + + MDTFilesUsed = maxif(Total, MetricName == "MDTFilesUsed") + + | extend UsedRatio = MDTFilesUsed / MDTFilesTotal + + | where UsedRatio > threshold_used + + | project UsedRatio, MDTFilesUsed, MDTFilesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: 48fc094d-8a00-4d3c-86d3-3230c7e5881a + references: +- name: MDT Bytes Available + description: Log an alert if MDTBytesAvailable is below 15% + type: Log + verified: true + visible: true + tags: + - hpc + properties: + severity: 2 + operator: LessThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 15 + metricMeasureColumn: AggregatedValue + dimensions: + - name: AvailableRatio + operator: Include + values: + - '*' + - name: MDTBytesAvailable + operator: Include + values: + - '*' + - name: MDTBytesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_available = 0.15; + + AzureMetrics + + | where 
MetricName == "MDTBytesAvailable" or MetricName == "MDTBytesTotal" + + | summarize + + MDTBytesAvailable = maxif(Total, MetricName == "MDTBytesAvailable"), + + MDTBytesTotal = maxif(Total, MetricName == "MDTBytesTotal") + + | extend AvailableRatio = MDTBytesAvailable / MDTBytesTotal + + | where AvailableRatio < threshold_available + + | project AvailableRatio, MDTBytesAvailable, MDTBytesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: ecec6f93-af7e-4071-b35d-cd70b3f16581 + references: +- name: MDT Bytes Used + description: Log an alert if MDTBytesUsed is above 85% + type: Log + verified: true + visible: true + tags: + - hpc + properties: + severity: 2 + operator: GreaterThan + timeAggregation: Average + windowSize: PT1M + evaluationFrequency: PT5M + threshold: 85 + metricMeasureColumn: AggregatedValue + dimensions: + - name: UsedRatio + operator: Include + values: + - '*' + - name: MDTBytesUsed + operator: Include + values: + - '*' + - name: MDTBytesTotal + operator: Include + values: + - '*' + failingPeriods: + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + query: ' + + let threshold_used = 0.85; + + AzureMetrics + + | where MetricName == "MDTBytesTotal" or MetricName == "MDTBytesUsed" + + | summarize + + MDTBytesTotal = maxif(Total, MetricName == "MDTBytesTotal"), + + MDTBytesUsed = maxif(Total, MetricName == "MDTBytesUsed") + + | extend UsedRatio = MDTBytesUsed / MDTBytesTotal + + | where UsedRatio > threshold_used + + | project UsedRatio, MDTBytesUsed, MDTBytesTotal + + ' + autoMitigate: true + autoResolve: true + autoResolveTime: 0:10:00 + guid: ebd68fdd-9672-43e8-b7d5-6e479210535d + references: +- name: Uptime + description: Alert when the Uptime metric of the cache falls below the threshold type: Metric verified: false - visible: false + visible: true tags: - hpc properties: - metricName: ClientLatency - metricNamespace: Microsoft.StorageCache/amlFilesystems - severity: 2 + metricName: Uptime + metricNamespace: 
Microsoft.StorageCache/caches + severity: 1 windowSize: PT5M evaluationFrequency: PT1M - timeAggregation: Average - operator: GreaterThan + timeAggregation: Total + operator: LessThan criterionType: StaticThresholdCriterion - threshold: 10 + threshold: 99 autoMitigate: false - guid: 31dead21-5454-41f0-9aab-faeabef89d67 + references: + - name: Monitor HPC Cache with metrics and alerts + url: https://learn.microsoft.com/en-us/azure/hpc-cache/metrics#metrics-page + guid: 7f951991-c6ce-4c72-9f55-7eade2c4f57c diff --git a/services/StorageCache/AmlFilesystems/templates/policy/AverageClientLatency_31dead21-5454-41f0-9aab-faeabef89d67.json b/services/StorageCache/AmlFilesystems/templates/policy/AverageClientLatency_31dead21-5454-41f0-9aab-faeabef89d67.json deleted file mode 100644 index a7a7bc583..000000000 --- a/services/StorageCache/AmlFilesystems/templates/policy/AverageClientLatency_31dead21-5454-41f0-9aab-faeabef89d67.json +++ /dev/null @@ -1,333 +0,0 @@ -{ - "type": "Microsoft.Authorization/policyDefinitions", - "apiVersion": "2021-06-01", - "name": "31dead21-5454-41f0-9aab-faeabef89d67", - "properties": { - "policyType": "Custom", - "mode": "All", - "displayName": "Deploy StorageCache amlFilesystems ClientLatency Alert", - "description": "Policy to Audit/Deploy StorageCache amlFilesystems ClientLatency Alert", - "metadata": { - "version": "1.0.0-preview", - "category": "StorageCache", - "preview": true, - "source": "https://github.com/Azure/azure-monitor-baseline-alerts/", - "alzCloudEnvironments": [ - "AzureCloud" - ], - "_deployed_by_amba": "True" - }, - "parameters": { - "severity": { - "type": "String", - "metadata": { - "displayName": "Severity", - "description": "Severity of the Alert" - }, - "allowedValues": [ - "0", - "1", - "2", - "3", - "4" - ], - "defaultValue": "2" - }, - "windowSize": { - "type": "String", - "metadata": { - "displayName": "Window Size", - "description": "Window size for the alert" - }, - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", 
- "PT30M", - "PT1H", - "PT6H", - "PT12H", - "P1D" - ], - "defaultValue": "PT5M" - }, - "evaluationFrequency": { - "type": "String", - "metadata": { - "displayName": "Evaluation Frequency", - "description": "Evaluation frequency for the alert" - }, - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H" - ], - "defaultValue": "PT1M" - }, - "autoMitigate": { - "type": "String", - "metadata": { - "displayName": "Auto Mitigate", - "description": "Auto Mitigate for the alert" - }, - "allowedValues": [ - "true", - "false" - ], - "defaultValue": "true" - }, - "enabled": { - "type": "String", - "metadata": { - "displayName": "Alert State", - "description": "Alert state for the alert" - }, - "allowedValues": [ - "true", - "false" - ], - "defaultValue": "true" - }, - "threshold": { - "type": "String", - "metadata": { - "displayName": "Threshold", - "description": "Threshold for the alert" - }, - "defaultValue": "10" - }, - "effect": { - "type": "String", - "metadata": { - "displayName": "Effect", - "description": "Effect of the policy" - }, - "allowedValues": [ - "deployIfNotExists", - "disabled" - ], - "defaultValue": "deployIfNotExists" - }, - "MonitorDisableTagName": { - "type": "String", - "metadata": { - "displayName": "Monitoring disabled tag name", - "description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled." - }, - "defaultValue": "MonitorDisable" - }, - "MonitorDisableTagValues": { - "type": "Array", - "metadata": { - "displayName": "Monitoring disabled tag values(s)", - "description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled." 
- }, - "defaultValue": [ - "true", - "Test", - "Dev", - "Sandbox" - ] - } - }, - "policyRule": { - "if": { - "allOf": [ - { - "field": "type", - "equals": "Microsoft.StorageCache/amlFilesystems" - }, - { - "field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]", - "notIn": "[[parameters('MonitorDisableTagValues')]" - } - ] - }, - "then": { - "effect": "[[parameters('effect')]", - "details": { - "roleDefinitionIds": [ - "/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c" - ], - "type": "Microsoft.Insights/metricAlerts", - "existenceCondition": { - "allOf": [ - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace", - "equals": "Microsoft.StorageCache/amlFilesystems" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName", - "equals": "ClientLatency" - }, - { - "field": "Microsoft.Insights/metricalerts/scopes[*]", - "equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.StorageCache/amlFilesystems/', field('fullName'))]" - }, - { - "field": "Microsoft.Insights/metricAlerts/enabled", - "equals": "[[parameters('enabled')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/evaluationFrequency", - "equals": "[[parameters('evaluationFrequency')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/windowSize", - "equals": "[[parameters('windowSize')]" - }, - { - "field": "Microsoft.Insights/metricalerts/severity", - "equals": "[[parameters('severity')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/autoMitigate", - "equals": "[[parameters('autoMitigate')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation", - "equals": "Average" - }, - { - "field": 
"Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator", - "equals": "GreaterThan" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold", - "equals": "[[if(contains(field('tags'), '_amba-ClientLatency-threshold-Override_'), field('tags._amba-ClientLatency-threshold-Override_'), parameters('threshold'))]" - } - ] - }, - "deployment": { - "properties": { - "mode": "incremental", - "template": { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "resourceName": { - "type": "String", - "metadata": { - "displayName": "resourceName", - "description": "Name of the resource" - } - }, - "resourceId": { - "type": "String", - "metadata": { - "displayName": "resourceId", - "description": "Resource ID of the resource emitting the metric that will be used for the comparison" - } - }, - "severity": { - "type": "String" - }, - "windowSize": { - "type": "String" - }, - "evaluationFrequency": { - "type": "String" - }, - "autoMitigate": { - "type": "String" - }, - "enabled": { - "type": "String" - }, - "threshold": { - "type": "String" - } - }, - "variables": {}, - "resources": [ - { - "type": "Microsoft.Insights/metricAlerts", - "apiVersion": "2018-03-01", - "name": "[[concat(parameters('resourceName'), '-ClientLatency')]", - "location": "global", - "tags": { - "_deployed_by_amba": true - }, - "properties": { - "description": "Metric Alert for StorageCache amlFilesystems ClientLatency", - "severity": "[[parameters('severity')]", - "enabled": "[[parameters('enabled')]", - "scopes": [ - "[[parameters('resourceId')]" - ], - "evaluationFrequency": "[[parameters('evaluationFrequency')]", - "windowSize": "[[parameters('windowSize')]", - "criteria": { - "allOf": [ - { - "name": 
"ClientLatency", - "metricNamespace": "Microsoft.StorageCache/amlFilesystems", - "metricName": "ClientLatency", - "operator": "GreaterThan", - "threshold": "[[parameters('threshold')]", - "timeAggregation": "Average", - "criterionType": "StaticThresholdCriterion" - } - ], - "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" - }, - "autoMitigate": "[[parameters('autoMitigate')]", - "parameters": { - "severity": { - "value": "[[parameters('severity')]" - }, - "windowSize": { - "value": "[[parameters('windowSize')]" - }, - "evaluationFrequency": { - "value": "[[parameters('evaluationFrequency')]" - }, - "autoMitigate": { - "value": "[[parameters('autoMitigate')]" - }, - "enabled": { - "value": "[[parameters('enabled')]" - }, - "threshold": { - "value": "[[parameters('threshold')]" - } - } - } - } - ] - }, - "parameters": { - "resourceName": { - "value": "[[field('name')]" - }, - "resourceId": { - "value": "[[field('id')]" - }, - "severity": { - "value": "[[parameters('severity')]" - }, - "windowSize": { - "value": "[[parameters('windowSize')]" - }, - "evaluationFrequency": { - "value": "[[parameters('evaluationFrequency')]" - }, - "autoMitigate": { - "value": "[[parameters('autoMitigate')]" - }, - "enabled": { - "value": "[[parameters('enabled')]" - }, - "threshold": { - "value": "[[if(contains(field('tags'), '_amba-ClientLatency-threshold-Override_'), field('tags._amba-ClientLatency-threshold-Override_'), parameters('threshold'))]" - } - } - } - } - } - } - } - } -} diff --git a/services/StorageCache/AmlFilesystems/templates/policy/TotalClientIOPS_ff3df303-c3b0-449c-840d-68084707773e.json b/services/StorageCache/AmlFilesystems/templates/policy/TotalClientIOPS_ff3df303-c3b0-449c-840d-68084707773e.json deleted file mode 100644 index 63652816a..000000000 --- a/services/StorageCache/AmlFilesystems/templates/policy/TotalClientIOPS_ff3df303-c3b0-449c-840d-68084707773e.json +++ /dev/null @@ -1,333 +0,0 @@ -{ - "type": 
"Microsoft.Authorization/policyDefinitions", - "apiVersion": "2021-06-01", - "name": "ff3df303-c3b0-449c-840d-68084707773e", - "properties": { - "policyType": "Custom", - "mode": "All", - "displayName": "Deploy StorageCache amlFilesystems ClientIOPS Alert", - "description": "Policy to Audit/Deploy StorageCache amlFilesystems ClientIOPS Alert", - "metadata": { - "version": "1.0.0-preview", - "category": "StorageCache", - "preview": true, - "source": "https://github.com/Azure/azure-monitor-baseline-alerts/", - "alzCloudEnvironments": [ - "AzureCloud" - ], - "_deployed_by_amba": "True" - }, - "parameters": { - "severity": { - "type": "String", - "metadata": { - "displayName": "Severity", - "description": "Severity of the Alert" - }, - "allowedValues": [ - "0", - "1", - "2", - "3", - "4" - ], - "defaultValue": "3" - }, - "windowSize": { - "type": "String", - "metadata": { - "displayName": "Window Size", - "description": "Window size for the alert" - }, - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H", - "PT6H", - "PT12H", - "P1D" - ], - "defaultValue": "PT5M" - }, - "evaluationFrequency": { - "type": "String", - "metadata": { - "displayName": "Evaluation Frequency", - "description": "Evaluation frequency for the alert" - }, - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H" - ], - "defaultValue": "PT1M" - }, - "autoMitigate": { - "type": "String", - "metadata": { - "displayName": "Auto Mitigate", - "description": "Auto Mitigate for the alert" - }, - "allowedValues": [ - "true", - "false" - ], - "defaultValue": "true" - }, - "enabled": { - "type": "String", - "metadata": { - "displayName": "Alert State", - "description": "Alert state for the alert" - }, - "allowedValues": [ - "true", - "false" - ], - "defaultValue": "true" - }, - "threshold": { - "type": "String", - "metadata": { - "displayName": "Threshold", - "description": "Threshold for the alert" - }, - "defaultValue": "10000" - }, - "effect": { - "type": "String", - 
"metadata": { - "displayName": "Effect", - "description": "Effect of the policy" - }, - "allowedValues": [ - "deployIfNotExists", - "disabled" - ], - "defaultValue": "deployIfNotExists" - }, - "MonitorDisableTagName": { - "type": "String", - "metadata": { - "displayName": "Monitoring disabled tag name", - "description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled." - }, - "defaultValue": "MonitorDisable" - }, - "MonitorDisableTagValues": { - "type": "Array", - "metadata": { - "displayName": "Monitoring disabled tag values(s)", - "description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled." - }, - "defaultValue": [ - "true", - "Test", - "Dev", - "Sandbox" - ] - } - }, - "policyRule": { - "if": { - "allOf": [ - { - "field": "type", - "equals": "Microsoft.StorageCache/amlFilesystems" - }, - { - "field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]", - "notIn": "[[parameters('MonitorDisableTagValues')]" - } - ] - }, - "then": { - "effect": "[[parameters('effect')]", - "details": { - "roleDefinitionIds": [ - "/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c" - ], - "type": "Microsoft.Insights/metricAlerts", - "existenceCondition": { - "allOf": [ - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace", - "equals": "Microsoft.StorageCache/amlFilesystems" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName", - "equals": "ClientIOPS" - }, - { - "field": "Microsoft.Insights/metricalerts/scopes[*]", - "equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.StorageCache/amlFilesystems/', field('fullName'))]" - }, - { - "field": "Microsoft.Insights/metricAlerts/enabled", - 
"equals": "[[parameters('enabled')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/evaluationFrequency", - "equals": "[[parameters('evaluationFrequency')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/windowSize", - "equals": "[[parameters('windowSize')]" - }, - { - "field": "Microsoft.Insights/metricalerts/severity", - "equals": "[[parameters('severity')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/autoMitigate", - "equals": "[[parameters('autoMitigate')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation", - "equals": "Total" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator", - "equals": "GreaterThan" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold", - "equals": "[[if(contains(field('tags'), '_amba-ClientIOPS-threshold-Override_'), field('tags._amba-ClientIOPS-threshold-Override_'), parameters('threshold'))]" - } - ] - }, - "deployment": { - "properties": { - "mode": "incremental", - "template": { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "resourceName": { - "type": "String", - "metadata": { - "displayName": "resourceName", - "description": "Name of the resource" - } - }, - "resourceId": { - "type": "String", - "metadata": { - "displayName": "resourceId", - "description": "Resource ID of the resource emitting the metric that will be used for the comparison" - } - }, - "severity": { - "type": "String" - }, - "windowSize": { - "type": "String" - }, - "evaluationFrequency": { - "type": "String" - }, - "autoMitigate": { - "type": "String" - }, - "enabled": { - "type": "String" - }, - "threshold": { - "type": 
"String" - } - }, - "variables": {}, - "resources": [ - { - "type": "Microsoft.Insights/metricAlerts", - "apiVersion": "2018-03-01", - "name": "[[concat(parameters('resourceName'), '-ClientIOPS')]", - "location": "global", - "tags": { - "_deployed_by_amba": true - }, - "properties": { - "description": "Metric Alert for StorageCache amlFilesystems ClientIOPS", - "severity": "[[parameters('severity')]", - "enabled": "[[parameters('enabled')]", - "scopes": [ - "[[parameters('resourceId')]" - ], - "evaluationFrequency": "[[parameters('evaluationFrequency')]", - "windowSize": "[[parameters('windowSize')]", - "criteria": { - "allOf": [ - { - "name": "ClientIOPS", - "metricNamespace": "Microsoft.StorageCache/amlFilesystems", - "metricName": "ClientIOPS", - "operator": "GreaterThan", - "threshold": "[[parameters('threshold')]", - "timeAggregation": "Total", - "criterionType": "StaticThresholdCriterion" - } - ], - "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" - }, - "autoMitigate": "[[parameters('autoMitigate')]", - "parameters": { - "severity": { - "value": "[[parameters('severity')]" - }, - "windowSize": { - "value": "[[parameters('windowSize')]" - }, - "evaluationFrequency": { - "value": "[[parameters('evaluationFrequency')]" - }, - "autoMitigate": { - "value": "[[parameters('autoMitigate')]" - }, - "enabled": { - "value": "[[parameters('enabled')]" - }, - "threshold": { - "value": "[[parameters('threshold')]" - } - } - } - } - ] - }, - "parameters": { - "resourceName": { - "value": "[[field('name')]" - }, - "resourceId": { - "value": "[[field('id')]" - }, - "severity": { - "value": "[[parameters('severity')]" - }, - "windowSize": { - "value": "[[parameters('windowSize')]" - }, - "evaluationFrequency": { - "value": "[[parameters('evaluationFrequency')]" - }, - "autoMitigate": { - "value": "[[parameters('autoMitigate')]" - }, - "enabled": { - "value": "[[parameters('enabled')]" - }, - "threshold": { - "value": "[[if(contains(field('tags'), 
'_amba-ClientIOPS-threshold-Override_'), field('tags._amba-ClientIOPS-threshold-Override_'), parameters('threshold'))]" - } - } - } - } - } - } - } - } -} diff --git a/services/StorageCache/_index.md b/services/StorageCache/_index.md index ee14a1cdc..58d62013f 100644 --- a/services/StorageCache/_index.md +++ b/services/StorageCache/_index.md @@ -1,5 +1,5 @@ --- -title: StorageCache +title: AmlFilesystems geekdocCollapseSection: true +geekdocHidden: false --- - diff --git a/services/StorageCache/caches/alerts.yaml b/services/StorageCache/caches/alerts.yaml deleted file mode 100644 index 69aaf2ab5..000000000 --- a/services/StorageCache/caches/alerts.yaml +++ /dev/null @@ -1,22 +0,0 @@ -- name: Uptime - description: Total number of client input/output operations per second - type: Metric - verified: false - visible: true - tags: - - hpc - properties: - metricName: Uptime - metricNamespace: Microsoft.StorageCache/caches - severity: 1 - windowSize: PT5M - evaluationFrequency: PT1M - timeAggregation: Total - operator: LessThan - criterionType: StaticThresholdCriterion - threshold: 99 - autoMitigate: false - references: - - name: Monitor HPC Cache with metrics and alerts - url: https://learn.microsoft.com/en-us/azure/hpc-cache/metrics#metrics-page - guid: 7f951991-c6ce-4c72-9f55-7eade2c4f57c diff --git a/services/StorageCache/caches/templates/arm/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.json b/services/StorageCache/caches/templates/arm/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.json deleted file mode 100644 index c174a96c2..000000000 --- a/services/StorageCache/caches/templates/arm/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "alertName": { - "type": "string", - "minLength": 1, - "metadata": { - "description": "Name of the alert" - } - }, - "alertDescription": { - "type": "string", - 
"defaultValue": "Total number of client input/output operations per second", - "metadata": { - "description": "Description of alert" - } - }, - "targetResourceId": { - "type": "string", - "minLength": 1, - "metadata": { - "description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name" - } - }, - "targetResourceRegion": { - "type": "string", - "metadata": { - "description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS" - } - }, - "targetResourceType": { - "type": "string", - "minLength": 1, - "metadata": { - "description": "Resource type of target resources to be monitored." - } - }, - "isEnabled": { - "type": "bool", - "defaultValue": true, - "metadata": { - "description": "Specifies whether the alert is enabled" - } - }, - "alertSeverity": { - "type": "int", - "defaultValue": 1, - "allowedValues": [ - 0, - 1, - 2, - 3, - 4 - ], - "metadata": { - "description": "Severity of alert {0,1,2,3,4}" - } - }, - "operator": { - "type": "string", - "defaultValue": "LessThan", - "allowedValues": [ - "Equals", - "GreaterThan", - "GreaterThanOrEqual", - "LessThan", - "LessThanOrEqual" - ], - "metadata": { - "description": "Operator comparing the current value with the threshold value." - } - }, - "threshold": { - "type": "string", - "defaultValue": "99", - "metadata": { - "description": "The threshold value at which the alert is activated." - } - }, - "timeAggregation": { - "type": "string", - "defaultValue": "Total", - "allowedValues": [ - "Average", - "Minimum", - "Maximum", - "Total", - "Count" - ], - "metadata": { - "description": "How the data that is collected should be combined over time." 
- } - }, - "windowSize": { - "type": "string", - "defaultValue": "PT5M", - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H", - "PT6H", - "PT12H", - "PT24H", - "PT1D" - ], - "metadata": { - "description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format." - } - }, - "evaluationFrequency": { - "type": "string", - "defaultValue": "PT1M", - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H" - ], - "metadata": { - "description": "how often the metric alert is evaluated represented in ISO 8601 duration format" - } - }, - "currentDateTimeUtcNow": { - "type": "string", - "defaultValue": "[utcNow()]", - "metadata": { - "description": "The current date and time using the utcNow function. Used for deployment name uniqueness" - } - }, - "telemetryOptOut": { - "type": "string", - "defaultValue": "No", - "allowedValues": [ - "Yes", - "No" - ], - "metadata": { - "description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry." 
- } - } - }, - "variables": { - "pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]", - "varTargetResourceId": "[split(parameters('targetResourceId'), ',')]" - }, - "resources": [ - { - "type": "Microsoft.Insights/metricAlerts", - "apiVersion": "2018-03-01", - "name": "[parameters('alertName')]", - "location": "global", - "tags": { - "_deployed_by_amba": true - }, - "properties": { - "description": "[parameters('alertDescription')]", - "scopes": "[variables('varTargetResourceId')]", - "targetResourceType": "[parameters('targetResourceType')]", - "targetResourceRegion": "[parameters('targetResourceRegion')]", - "severity": "[parameters('alertSeverity')]", - "enabled": "[parameters('isEnabled')]", - "evaluationFrequency": "[parameters('evaluationFrequency')]", - "windowSize": "[parameters('windowSize')]", - "criteria": { - "odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria", - "allOf": [ - { - "name": "1st criterion", - "metricName": "Uptime", - "dimensions": [], - "operator": "[parameters('operator')]", - "threshold": "[parameters('threshold')]", - "timeAggregation": "[parameters('timeAggregation')]", - "criterionType": "StaticThresholdCriterion" - } - ] - } - } - }, - { - "condition": "[equals(parameters('telemetryOptOut'), 'No')]", - "apiVersion": "2020-06-01", - "name": "[variables('pidDeploymentName')]", - "type": "Microsoft.Resources/deployments", - "properties": { - "mode": "Incremental", - "template": { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "resources": [] - } - } - } - ] -} diff --git a/services/StorageCache/caches/templates/bicep/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.bicep b/services/StorageCache/caches/templates/bicep/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.bicep deleted file mode 100644 index 
fd041082f..000000000 --- a/services/StorageCache/caches/templates/bicep/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.bicep +++ /dev/null @@ -1,135 +0,0 @@ -@description('Name of the alert') -@minLength(1) -param alertName string - -@description('Description of alert') -param alertDescription string = 'Total number of client input/output operations per second' - -@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name') -@minLength(1) -param targetResourceId array - -@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS') -param targetResourceRegion string - -@description('Resource type of target resources to be monitored.') -@minLength(1) -param targetResourceType string - -@description('Specifies whether the alert is enabled') -param isEnabled bool = true - -@description('Severity of alert {0,1,2,3,4}') -@allowed([ - 0 - 1 - 2 - 3 - 4 -]) -param alertSeverity int = 1 - -@description('Operator comparing the current value with the threshold value.') -@allowed([ - 'Equals' - 'GreaterThan' - 'GreaterThanOrEqual' - 'LessThan' - 'LessThanOrEqual' -]) -param operator string = 'LessThan' - -@description('The threshold value at which the alert is activated.') -param threshold int = 99 - -@description('How the data that is collected should be combined over time.') -@allowed([ - 'Average' - 'Minimum' - 'Maximum' - 'Total' - 'Count' -]) -param timeAggregation string = 'Total' - -@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. 
ISO 8601 duration format.') -@allowed([ - 'PT1M' - 'PT5M' - 'PT15M' - 'PT30M' - 'PT1H' - 'PT6H' - 'PT12H' - 'PT24H' - 'P1D' -]) -param windowSize string = 'PT5M' - -@description('how often the metric alert is evaluated represented in ISO 8601 duration format') -@allowed([ - 'PT1M' - 'PT5M' - 'PT15M' - 'PT30M' - 'PT1H' -]) -param evaluationFrequency string = 'PT1M' - -@description('"The current date and time using the utcNow function. Used for deployment name uniqueness') -param currentDateTimeUtcNow string = utcNow() - -@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.') -@allowed([ - 'Yes' - 'No' -]) -param telemetryOptOut string = 'No' - -resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: alertName - location: 'global' - tags: { - _deployed_by_amba: 'true' - } - properties: { - description: alertDescription - scopes: targetResourceId - targetResourceType: targetResourceType - targetResourceRegion: targetResourceRegion - severity: alertSeverity - enabled: isEnabled - evaluationFrequency: evaluationFrequency - windowSize: windowSize - criteria: { - 'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria' - allOf: [ - { - name: '1st criterion' - metricName: 'Uptime' - dimensions: [[]] - operator: operator - threshold: threshold - timeAggregation: timeAggregation - criterionType: 'StaticThresholdCriterion' - } - ] - } - } -} - -var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}' -resource ambaTelemetryPid 'Microsoft.Resources/deployments@2020-06-01' = if (telemetryOptOut == 'No') { - name: ambaTelemetryPidName - tags: { - _deployed_by_amba: 'true' - } - properties: { - mode: 'Incremental' - template: { - '$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#' - contentVersion: '1.0.0.0' - 
resources: [] - } - } -} diff --git a/services/StorageCache/caches/templates/policy/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.json b/services/StorageCache/caches/templates/policy/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.json deleted file mode 100644 index c9d655f8f..000000000 --- a/services/StorageCache/caches/templates/policy/Uptime_7f951991-c6ce-4c72-9f55-7eade2c4f57c.json +++ /dev/null @@ -1,333 +0,0 @@ -{ - "type": "Microsoft.Authorization/policyDefinitions", - "apiVersion": "2021-06-01", - "name": "7f951991-c6ce-4c72-9f55-7eade2c4f57c", - "properties": { - "policyType": "Custom", - "mode": "All", - "displayName": "Deploy StorageCache caches Uptime Alert", - "description": "Policy to Audit/Deploy StorageCache caches Uptime Alert", - "metadata": { - "version": "1.0.0-preview", - "category": "StorageCache", - "preview": true, - "source": "https://github.com/Azure/azure-monitor-baseline-alerts/", - "alzCloudEnvironments": [ - "AzureCloud" - ], - "_deployed_by_amba": "True" - }, - "parameters": { - "severity": { - "type": "String", - "metadata": { - "displayName": "Severity", - "description": "Severity of the Alert" - }, - "allowedValues": [ - "0", - "1", - "2", - "3", - "4" - ], - "defaultValue": "1" - }, - "windowSize": { - "type": "String", - "metadata": { - "displayName": "Window Size", - "description": "Window size for the alert" - }, - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H", - "PT6H", - "PT12H", - "P1D" - ], - "defaultValue": "PT5M" - }, - "evaluationFrequency": { - "type": "String", - "metadata": { - "displayName": "Evaluation Frequency", - "description": "Evaluation frequency for the alert" - }, - "allowedValues": [ - "PT1M", - "PT5M", - "PT15M", - "PT30M", - "PT1H" - ], - "defaultValue": "PT1M" - }, - "autoMitigate": { - "type": "String", - "metadata": { - "displayName": "Auto Mitigate", - "description": "Auto Mitigate for the alert" - }, - "allowedValues": [ - "true", - "false" - ], - "defaultValue": "true" - }, - 
"enabled": { - "type": "String", - "metadata": { - "displayName": "Alert State", - "description": "Alert state for the alert" - }, - "allowedValues": [ - "true", - "false" - ], - "defaultValue": "true" - }, - "threshold": { - "type": "String", - "metadata": { - "displayName": "Threshold", - "description": "Threshold for the alert" - }, - "defaultValue": "99" - }, - "effect": { - "type": "String", - "metadata": { - "displayName": "Effect", - "description": "Effect of the policy" - }, - "allowedValues": [ - "deployIfNotExists", - "disabled" - ], - "defaultValue": "deployIfNotExists" - }, - "MonitorDisableTagName": { - "type": "String", - "metadata": { - "displayName": "Monitoring disabled tag name", - "description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled." - }, - "defaultValue": "MonitorDisable" - }, - "MonitorDisableTagValues": { - "type": "Array", - "metadata": { - "displayName": "Monitoring disabled tag values(s)", - "description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled." 
- }, - "defaultValue": [ - "true", - "Test", - "Dev", - "Sandbox" - ] - } - }, - "policyRule": { - "if": { - "allOf": [ - { - "field": "type", - "equals": "Microsoft.StorageCache/caches" - }, - { - "field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]", - "notIn": "[[parameters('MonitorDisableTagValues')]" - } - ] - }, - "then": { - "effect": "[[parameters('effect')]", - "details": { - "roleDefinitionIds": [ - "/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c" - ], - "type": "Microsoft.Insights/metricAlerts", - "existenceCondition": { - "allOf": [ - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace", - "equals": "Microsoft.StorageCache/caches" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName", - "equals": "Uptime" - }, - { - "field": "Microsoft.Insights/metricalerts/scopes[*]", - "equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.StorageCache/caches/', field('fullName'))]" - }, - { - "field": "Microsoft.Insights/metricAlerts/enabled", - "equals": "[[parameters('enabled')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/evaluationFrequency", - "equals": "[[parameters('evaluationFrequency')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/windowSize", - "equals": "[[parameters('windowSize')]" - }, - { - "field": "Microsoft.Insights/metricalerts/severity", - "equals": "[[parameters('severity')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/autoMitigate", - "equals": "[[parameters('autoMitigate')]" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation", - "equals": "Total" - }, - { - "field": 
"Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator", - "equals": "LessThan" - }, - { - "field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold", - "equals": "[[if(contains(field('tags'), '_amba-Uptime-threshold-Override_'), field('tags._amba-Uptime-threshold-Override_'), parameters('threshold'))]" - } - ] - }, - "deployment": { - "properties": { - "mode": "incremental", - "template": { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "resourceName": { - "type": "String", - "metadata": { - "displayName": "resourceName", - "description": "Name of the resource" - } - }, - "resourceId": { - "type": "String", - "metadata": { - "displayName": "resourceId", - "description": "Resource ID of the resource emitting the metric that will be used for the comparison" - } - }, - "severity": { - "type": "String" - }, - "windowSize": { - "type": "String" - }, - "evaluationFrequency": { - "type": "String" - }, - "autoMitigate": { - "type": "String" - }, - "enabled": { - "type": "String" - }, - "threshold": { - "type": "String" - } - }, - "variables": {}, - "resources": [ - { - "type": "Microsoft.Insights/metricAlerts", - "apiVersion": "2018-03-01", - "name": "[[concat(parameters('resourceName'), '-Uptime')]", - "location": "global", - "tags": { - "_deployed_by_amba": true - }, - "properties": { - "description": "Metric Alert for StorageCache caches Uptime", - "severity": "[[parameters('severity')]", - "enabled": "[[parameters('enabled')]", - "scopes": [ - "[[parameters('resourceId')]" - ], - "evaluationFrequency": "[[parameters('evaluationFrequency')]", - "windowSize": "[[parameters('windowSize')]", - "criteria": { - "allOf": [ - { - "name": "Uptime", - "metricNamespace": 
"Microsoft.StorageCache/caches", - "metricName": "Uptime", - "operator": "LessThan", - "threshold": "[[parameters('threshold')]", - "timeAggregation": "Total", - "criterionType": "StaticThresholdCriterion" - } - ], - "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" - }, - "autoMitigate": "[[parameters('autoMitigate')]", - "parameters": { - "severity": { - "value": "[[parameters('severity')]" - }, - "windowSize": { - "value": "[[parameters('windowSize')]" - }, - "evaluationFrequency": { - "value": "[[parameters('evaluationFrequency')]" - }, - "autoMitigate": { - "value": "[[parameters('autoMitigate')]" - }, - "enabled": { - "value": "[[parameters('enabled')]" - }, - "threshold": { - "value": "[[parameters('threshold')]" - } - } - } - } - ] - }, - "parameters": { - "resourceName": { - "value": "[[field('name')]" - }, - "resourceId": { - "value": "[[field('id')]" - }, - "severity": { - "value": "[[parameters('severity')]" - }, - "windowSize": { - "value": "[[parameters('windowSize')]" - }, - "evaluationFrequency": { - "value": "[[parameters('evaluationFrequency')]" - }, - "autoMitigate": { - "value": "[[parameters('autoMitigate')]" - }, - "enabled": { - "value": "[[parameters('enabled')]" - }, - "threshold": { - "value": "[[if(contains(field('tags'), '_amba-Uptime-threshold-Override_'), field('tags._amba-Uptime-threshold-Override_'), parameters('threshold'))]" - } - } - } - } - } - } - } - } -}