From 98c19bc87809242ab8baa712308f81f1419e5958 Mon Sep 17 00:00:00 2001 From: Naman Nandan Date: Tue, 21 Nov 2023 14:21:01 -0800 Subject: [PATCH] Auto detect backend metrics not defined in metrics configuration (#2769) * Auto detect backend metrics not defined in metrics configuration * Add tests for metric auto detection * Update custom metrics example to show metrics auto detection * Update metrics documentation * Fix unit test failure * Set request ID for load model requests * Update metrics integration test to use custom metrics example * Make auto updates to frontend metrics cache thread safe * Add auto detect metrics is always called on metric log that has type information * Fix auto detect backend metrics unit test * Update documentation about performance impact of metrics auto detection * fix linter error * Disable metrics auto detection by default * fix integration tests * Update documentation and custom metrics example about metrics auto detection * Update metrics auto detection documentation * Add helper functions to get dimension names and values from Metric object * Fix java formatting * Move request id assignment for model load requests from model_service_worker.py to model_loader.py --- docs/metrics.md | 42 +-- examples/custom_metrics/README.md | 7 +- examples/custom_metrics/config.properties | 1 + examples/custom_metrics/metrics.yaml | 8 +- examples/custom_metrics/mnist_handler.py | 6 +- .../org/pytorch/serve/metrics/Metric.java | 44 ++- .../pytorch/serve/metrics/MetricCache.java | 24 +- .../serve/metrics/MetricCollector.java | 6 +- .../org/pytorch/serve/util/ConfigManager.java | 21 +- .../pytorch/serve/wlm/WorkerLifeCycle.java | 57 ++-- .../serve/metrics/MetricCacheTest.java | 22 ++ .../metrics/MetricConfigurationTest.java | 11 +- .../org/pytorch/serve/metrics/MetricTest.java | 51 +++ .../metrics/metrics_auto_detect.yaml | 81 +++++ test/pytest/test_metrics.py | 292 ++++++++++++++++-- ts/metrics/caching_metric.py | 2 +- ts/model_loader.py | 8 + ts_scripts/spellcheck_conf/wordlist.txt | 1 + 18 files changed, 573 insertions(+), 111 deletions(-) create mode 100644 test/pytest/test_data/metrics/metrics_auto_detect.yaml diff --git a/docs/metrics.md b/docs/metrics.md index 5ae60b5cbd..aefb5c167b 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -36,16 +36,25 @@ The location of log files and metric files can be configured in the [log4j2.xml] **Prometheus Mode** -In `prometheus` mode, metrics defined in the metrics configuration file are made available in prometheus format via the [metrics API endpoint](metrics_api.md). +In `prometheus` mode, metrics are made available in prometheus format via the [metrics API endpoint](metrics_api.md). ## Getting Started with TorchServe Metrics -TorchServe defines metrics in a [yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file, including both frontend metrics (i.e. `ts_metrics`) and backend metrics (i.e. `model_metrics`). +TorchServe defines metrics configuration in a [yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file, including both frontend metrics (i.e. `ts_metrics`) and backend metrics (i.e. `model_metrics`). When TorchServe is started, the metrics definition is loaded in the frontend and backend cache separately. -The backend emits metrics logs as they are updated. The frontend parses these logs and makes the corresponding metrics available either as logs or via the [metrics API endpoint](metrics_api.md) based on the metrics_mode configuration. 
+The backend emits metrics logs as they are updated. The frontend parses these logs and makes the corresponding metrics available either as logs or via the [metrics API endpoint](metrics_api.md) based on the `metrics_mode` configuration.
 
-Dynamic updates to the metrics configuration file is currently not supported. In order to account for updates made to the metrics configuration file, Torchserve will need to be restarted.
+Dynamic updates to the metrics configuration file are not supported. TorchServe needs to be restarted for changes to the metrics configuration file to take effect.
+
+By default, metrics that are not defined in the metrics configuration file are neither logged in the metrics log files nor made available via the prometheus metrics API endpoint.
+Backend model metrics can be `auto-detected` and registered in the frontend by setting `model_metrics_auto_detect` to `true` in `config.properties`
+or using the `TS_MODEL_METRICS_AUTO_DETECT` environment variable. By default, `model_metrics_auto_detect` is disabled.
+
+`Warning: Auto-detection of backend metrics adds latency overhead, typically at model load and first inference for a given model.
+This cold start behavior occurs because new metrics are usually emitted by the backend during model load and first inference, at which point they are detected and registered by the frontend.
+Subsequent inferences can also see a performance impact when a new metric is updated for the first time.
+For use cases where multiple models are loaded and unloaded often, the latency overhead can be mitigated by specifying known metrics in the metrics configuration file ahead of time.`
 
 The `metrics.yaml` is formatted with Prometheus metric type terminology:
@@ -87,9 +96,6 @@ model_metrics: # backend metrics
         dimensions: [*model_name, *level]
 ```
-
-Note that **only** the metrics defined in the **metrics configuration file** can be emitted to model_metrics.log or made available via the [metrics API endpoint](metrics_api.md). This is done to ensure that the metrics configuration file serves as a central inventory of all the metrics that Torchserve can emit.
-
 Default metrics are provided in the [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file, but the user can either delete them to their liking / ignore them altogether, because these metrics will not be emitted unless they are updated.\
 When adding custom `model_metrics` in the metrics configuration file, ensure to include `ModelName` and `Level` dimension names towards the end of the list of dimensions since they are included by default by the following custom metrics APIs: [add_metric](#function-api-to-add-generic-metrics-with-default-dimensions), [add_counter](#add-counter-based-metrics),
@@ -175,7 +181,7 @@ Metrics collected include:
 
 ### Metric Types Enum
 
-TorchServe Metrics is introducing [Metric Types](https://github.com/pytorch/serve/blob/master/ts/metrics/metric_type_enum.py)
+TorchServe metrics use [Metric Types](https://github.com/pytorch/serve/blob/master/ts/metrics/metric_type_enum.py)
 that are in line with the [Prometheus API](https://github.com/prometheus/client_python) metric types.
 
 Metric types are an attribute of Metric objects.
@@ -268,14 +274,15 @@ All metrics are collected within the context.
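+As an illustration of the auto-detection behavior described earlier, a handler can update a metric that is absent from the metrics configuration file. The sketch below assumes `model_metrics_auto_detect` is enabled and uses a hypothetical metric name:
+
+```python
+from ts.metrics.metric_type_enum import MetricTypes
+
+
+def preprocess(self, data):
+    # "UndeclaredHandlerLatency" is intentionally not declared in metrics.yaml.
+    # With model_metrics_auto_detect=true, the frontend registers it when it is
+    # first emitted; otherwise the update is dropped by the frontend.
+    self.context.metrics.add_metric(
+        name="UndeclaredHandlerLatency",
+        value=42.0,
+        unit="ms",
+        metric_type=MetricTypes.GAUGE,
+    )
+    return data
+```
+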
### Specifying Metric Types -When adding any metric via Metrics API, users have the ability to override the metric type by specifying the positional argument +When adding any metric via Metrics API, users have the ability to override the default metric type by specifying the positional argument `metric_type=MetricTypes.[COUNTER/GAUGE/HISTOGRAM]`. ```python -metrics.add_metric("GenericMetric", value, unit=unit, dimension_names=["name1", "name2", ...], metric_type=MetricTypes.GAUGE) +example_metric = metrics.add_metric_to_cache(name="ExampleMetric", unit="ms", dimension_names=["name1", "name2"], metric_type=MetricTypes.GAUGE) +example_metric.add_or_update(value=1, dimension_values=["value1", "value2"]) # Backwards compatible, combines the above two method calls -metrics.add_counter("CounterMetric", value=1, dimensions=[Dimension("name", "value"), ...]) +metrics.add_metric(name="ExampleMetric", value=1, unit="ms", dimensions=[Dimension("name1", "value1"), Dimension("name2", "value2")], metric_type=MetricTypes.GAUGE) ``` @@ -302,14 +309,12 @@ given some criteria: 3. Dimensions should be the same (as well as the same order!) 1. All dimensions have to match, and Metric objects that have been parsed from the yaml file also have dimension names that are parsed from the yaml file 1. Users can [create their own](#create-dimension-objects) `Dimension` objects to match those in the yaml file dimensions - 2. if the Metric object has `ModelName` and `Level` dimensions only, it is optional to specify additional dimensions since these are considered [default dimensions](#default-dimensions), so: `add_counter('InferenceTimeInMS', value=2)` or `add_counter('InferenceTimeInMS', value=2, dimensions=["ModelName", "Level"])` + 2. If the Metric object has `ModelName` and `Level` dimensions only, it is optional to specify additional dimensions since these are considered [default dimensions](#default-dimensions), so: `add_counter('InferenceTimeInMS', value=2)` or `add_counter('InferenceTimeInMS', value=2, dimensions=["ModelName", "Level"])` ### Default dimensions -Metrics will have a couple of default dimensions if not already specified. - -If the metric is a type `Gauge`, `Histogram`, `Counter`, by default it will have: +Metrics will have a couple of default dimensions if not already specified: * `ModelName,{name_of_model}` * `Level,Model` @@ -555,7 +560,6 @@ Function API metric_type: MetricTypes type for defining different operations, defaulted to gauge metric type for Percent metrics """ - ``` **Inferred unit**: `percent` @@ -599,7 +603,7 @@ Function API ### Getting a metric -Users can get a metric from the cache. The Metric object is returned, so the user can access the methods of the Metric: (i.e. `Metric.update(value)`, `Metric.__str__`) +Users can get a metric from the cache. The CachingMetric object is returned, so the user can access the methods of the CachingMetric: (i.e. `CachingMetric.add_or_update(value, dimensions_values)`, `CachingMetric.update(value, dimensions)`) ```python def get_metric(self, metric_name: str, metric_type: MetricTypes) -> Metric: @@ -723,12 +727,12 @@ class CustomHandlerExample: ``` - **[v0.6.1 - v0.8.1] to [> v0.8.1]**\ Replace the call to `add_metric` with `add_metric_to_cache`. -2. Starting [v0.8.0](https://github.com/pytorch/serve/releases/tag/v0.8.0), only metrics that are defined in the metrics config file(default: [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml)) +2. 
In versions [[v0.8.0](https://github.com/pytorch/serve/releases/tag/v0.8.0) - [v0.9.0](https://github.com/pytorch/serve/releases/tag/v0.9.0)], only metrics that are defined in the metrics config file (default: [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml))
 are either all logged to `ts_metrics.log` and `model_metrics.log` or made available via the [metrics API endpoint](metrics_api.md) based on the `metrics_mode` configuration as described [above](#introduction).\
 The default `metrics_mode` is `log` mode.\
 This is unlike in previous versions where all metrics were only logged to `ts_metrics.log` and `model_metrics.log` except for `ts_inference_requests_total`, `ts_inference_latency_microseconds` and `ts_queue_latency_microseconds` which were only available via the metrics API endpoint.\
 **Upgrade paths**:
-   - **[< v0.8.0] to [>= v0.8.0]**\
+   - **[< v0.8.0] to [v0.8.0 - v0.9.0]**\
     Specify all the custom metrics added to the custom handler in the metrics configuration file as shown [above](#getting-started-with-torchserve-metrics).

diff --git a/examples/custom_metrics/README.md b/examples/custom_metrics/README.md
index 2abbd4682d..f6da045e8d 100644
--- a/examples/custom_metrics/README.md
+++ b/examples/custom_metrics/README.md
@@ -18,9 +18,10 @@ Run the commands given in following steps from the root directory of the reposit
 - HandlerMethodTime
 - ExamplePercentMetric
 
-   The custom metrics configuration file `metrics.yaml` in this example builds on top of the [default metrics configuration file](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) to include the custom metrics listed above.
-   The `config.properties` file in this example configures torchserve to use the custom metrics configuration file and sets the `metrics_mode` to `prometheus`. The custom handler
-   `mnist_handler.py` updates the metrics listed above.
+   The custom metrics configuration file [metrics.yaml](metrics.yaml) in this example builds on top of the [default metrics configuration file](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) to include the custom metrics listed above.
+   Note that `HandlerMethodTime` and `ExamplePercentMetric` are not defined in the [metrics configuration file](metrics.yaml), to demonstrate auto-detection of backend metrics.
+   The [config.properties](config.properties) file in this example configures TorchServe to use the custom metrics configuration file, sets the `metrics_mode` to `prometheus`, and enables `model_metrics_auto_detect`. The custom handler
+   [mnist_handler.py](mnist_handler.py) updates the metrics listed above.
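+
+   As a rough sketch of what the handler does (see [mnist_handler.py](mnist_handler.py) for the full code), the two auto-detected metrics are emitted along these lines:
+
+   ```python
+   from ts.metrics.dimension import Dimension
+   from ts.metrics.metric_type_enum import MetricTypes
+
+   # Neither metric is declared in metrics.yaml, so with
+   # model_metrics_auto_detect=true the frontend auto-detects and
+   # registers both when they are first emitted.
+   metrics.add_time(
+       name="HandlerMethodTime",
+       value=(preprocess_stop - preprocess_start) * 1000,
+       unit="ms",
+       metric_type=MetricTypes.GAUGE,
+       dimensions=[Dimension(name="MethodName", value="preprocess")],
+   )
+   metrics.add_percent(name="ExamplePercentMetric", value=50)
+   ```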
 
    Refer: [Custom Metrics](https://github.com/pytorch/serve/blob/master/docs/metrics.md#custom-metrics-api)\
    Refer: [Custom Handler](https://github.com/pytorch/serve/blob/master/docs/custom_service.md#custom-handlers)

diff --git a/examples/custom_metrics/config.properties b/examples/custom_metrics/config.properties
index 02607ac36d..f2dbdf096c 100644
--- a/examples/custom_metrics/config.properties
+++ b/examples/custom_metrics/config.properties
@@ -1,5 +1,6 @@
 metrics_mode=prometheus
 metrics_config=examples/custom_metrics/metrics.yaml
+model_metrics_auto_detect=true
 models={\
     "mnist": {\
         "1.0": {\
diff --git a/examples/custom_metrics/metrics.yaml b/examples/custom_metrics/metrics.yaml
index 71aa7b5e46..b71c136a06 100644
--- a/examples/custom_metrics/metrics.yaml
+++ b/examples/custom_metrics/metrics.yaml
@@ -68,6 +68,7 @@ ts_metrics:
 
 model_metrics:
   # Dimension "Hostname" is automatically added for model metrics in the backend
+  # "HandlerMethodTime" and "ExamplePercentMetric" metrics are not defined here to show auto-detection of backend metrics
   counter:
     - name: InferenceRequestCount
       unit: count
@@ -94,10 +95,3 @@ model_metrics:
     - name: SizeOfImage
       unit: kB
       dimensions: [*model_name, *level]
-    - name: HandlerMethodTime
-      unit: ms
-      dimensions: ["MethodName", *model_name, *level]
-  histogram:
-    - name: ExamplePercentMetric
-      unit: percent
-      dimensions: [*model_name, *level]
diff --git a/examples/custom_metrics/mnist_handler.py b/examples/custom_metrics/mnist_handler.py
index 632afd5a82..e9e7b220b3 100644
--- a/examples/custom_metrics/mnist_handler.py
+++ b/examples/custom_metrics/mnist_handler.py
@@ -41,10 +41,6 @@ def initialize(self, context):
             name="InitializeCallCount",
             value=1,
             unit="count",
-            dimensions=[
-                Dimension(name="ModelName", value=context.model_name),
-                Dimension(name="Level", value="Model"),
-            ],
             metric_type=MetricTypes.COUNTER,
         )
 
@@ -95,6 +91,7 @@ def preprocess(self, data):
 
         # "add_time" will register the metric if not already present in metric cache,
         # include the "ModelName" and "Level" dimensions by default and emit it
+        # Note: "HandlerMethodTime" is not defined in "metrics.yaml" and will be auto-detected
         metrics.add_time(
             name="HandlerMethodTime",
             value=(preprocess_stop - preprocess_start) * 1000,
@@ -122,6 +119,7 @@ def postprocess(self, data):
         )
         # "add_percent" will register the metric if not already present in metric cache,
         # include the "ModelName" and "Level" dimensions by default and emit it
+        # Note: "ExamplePercentMetric" is not defined in "metrics.yaml" and will be auto-detected
         self.context.metrics.add_percent(
             name="ExamplePercentMetric",
             value=50,
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/Metric.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/Metric.java
index 4f17deb518..91506eb16e 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/metrics/Metric.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/Metric.java
@@ -12,7 +12,7 @@ public class Metric {
 
     private static final Pattern PATTERN =
             Pattern.compile(
-                    "\\s*([\\w\\s]+)\\.([\\w\\s]+):([0-9\\-,.e]+)\\|#([^|]*)\\|#hostname:([^,]+),([^,]+)(,(.*))?");
+                    "\\s*([\\w\\s]+)\\.([\\w\\s]+):([0-9\\-,.e]+)\\|#([^|]*)(\\|#type:([^|,]+))?\\|#hostname:([^,]+),([^,]+)(,(.*))?");
 
     @SerializedName("MetricName")
     private String metricName;
@@ -23,9 +23,18 @@ public class Metric {
     @SerializedName("Unit")
     private String unit;
 
+    @SerializedName("Type")
+    private String type;
+
     @SerializedName("Dimensions")
     private List<Dimension> dimensions;
 
+    @SerializedName("DimensionNames")
+    private List<String> dimensionNames;
+
+    @SerializedName("DimensionValues")
+    private List<String> dimensionValues;
+
     @SerializedName("Timestamp")
     private String timestamp;
@@ -41,13 +50,15 @@ public Metric(
             String metricName,
             String value,
             String unit,
+            String type,
             String hostName,
             Dimension... dimensions) {
         this.metricName = metricName;
         this.value = value;
         this.unit = unit;
+        this.type = type;
         this.hostName = hostName;
-        this.dimensions = Arrays.asList(dimensions);
+        this.setDimensions(Arrays.asList(dimensions));
         this.timestamp =
                 String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()));
     }
@@ -92,12 +103,34 @@ public void setUnit(String unit) {
         this.unit = unit;
     }
 
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
     public List<Dimension> getDimensions() {
         return dimensions;
     }
 
+    public List<String> getDimensionNames() {
+        return this.dimensionNames;
+    }
+
+    public List<String> getDimensionValues() {
+        return this.dimensionValues;
+    }
+
     public void setDimensions(List<Dimension> dimensions) {
         this.dimensions = dimensions;
+        this.dimensionNames = new ArrayList<String>();
+        this.dimensionValues = new ArrayList<String>();
+        for (Dimension dimension : dimensions) {
+            this.dimensionNames.add(dimension.getName());
+            this.dimensionValues.add(dimension.getValue());
+        }
     }
 
     public String getTimestamp() {
@@ -120,9 +153,10 @@ public static Metric parse(String line) {
         metric.setUnit(matcher.group(2));
         metric.setValue(matcher.group(3));
         String dimensions = matcher.group(4);
-        metric.setHostName(matcher.group(5));
-        metric.setTimestamp(matcher.group(6));
-        metric.setRequestId(matcher.group(8));
+        metric.setType(matcher.group(6));
+        metric.setHostName(matcher.group(7));
+        metric.setTimestamp(matcher.group(8));
+        metric.setRequestId(matcher.group(10));
 
         if (dimensions != null) {
             String[] dimension = dimensions.split(",");
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java
index 3ac81f3d54..5787701c64 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java
@@ -29,11 +29,7 @@ private MetricCache() throws FileNotFoundException {
             return;
         }
 
-        MetricBuilder.MetricMode metricsMode = MetricBuilder.MetricMode.LOG;
-        String metricsConfigMode = ConfigManager.getInstance().getMetricsMode();
-        if (metricsConfigMode != null && metricsConfigMode.toLowerCase().contains("prometheus")) {
-            metricsMode = MetricBuilder.MetricMode.PROMETHEUS;
-        }
+        MetricBuilder.MetricMode metricsMode = ConfigManager.getInstance().getMetricsMode();
 
         if (this.config.getTs_metrics() != null) {
             addMetrics(
@@ -106,6 +102,24 @@ public static MetricCache getInstance() {
         return instance;
     }
 
+    public IMetric addAutoDetectMetricBackend(Metric parsedMetric) {
+        // The Hostname dimension is included by default for backend metrics
+        List<String> dimensionNames = parsedMetric.getDimensionNames();
+        dimensionNames.add("Hostname");
+
+        IMetric metric =
+                MetricBuilder.build(
+                        ConfigManager.getInstance().getMetricsMode(),
+                        MetricBuilder.MetricType.valueOf(parsedMetric.getType()),
+                        parsedMetric.getMetricName(),
+                        parsedMetric.getUnit(),
+                        dimensionNames);
+
+        this.metricsBackend.putIfAbsent(parsedMetric.getMetricName(), metric);
+
+        return metric;
+    }
+
     public IMetric getMetricFrontend(String metricName) {
         return metricsFrontend.get(metricName);
     }
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java
index 1585d71e90..fe189bfb6d 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java
@@ -81,12 +81,10 @@ public void run() {
                 } else {
                     if (this.metricCache.getMetricFrontend(metric.getMetricName()) != null) {
                         try {
-                            List<String> dimensionValues = new ArrayList<String>();
-                            for (Dimension dimension : metric.getDimensions()) {
-                                dimensionValues.add(dimension.getValue());
-                            }
                             // Frontend metrics by default have the last dimension as Hostname
+                            List<String> dimensionValues = metric.getDimensionValues();
                             dimensionValues.add(metric.getHostName());
+
                             this.metricCache
                                     .getMetricFrontend(metric.getMetricName())
                                     .addOrUpdate(
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java
index ec1be05489..6449f0d662 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java
@@ -43,6 +43,7 @@
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 import org.apache.commons.io.IOUtils;
+import org.pytorch.serve.metrics.MetricBuilder;
 import org.pytorch.serve.servingsdk.snapshot.SnapshotSerializer;
 import org.pytorch.serve.snapshot.SnapshotSerializerFactory;
 import org.slf4j.Logger;
@@ -68,6 +69,7 @@ public final class ConfigManager {
     private static final String TS_NUMBER_OF_GPU = "number_of_gpu";
     private static final String TS_METRICS_CONFIG = "metrics_config";
     private static final String TS_METRICS_MODE = "metrics_mode";
+    private static final String TS_MODEL_METRICS_AUTO_DETECT = "model_metrics_auto_detect";
     private static final String TS_DISABLE_SYSTEM_METRICS = "disable_system_metrics";
 
     // IPEX config option that can be set at config.properties
@@ -412,8 +414,23 @@ public String getTorchRunLogDir() {
         return torchrunLogDir;
     }
 
-    public String getMetricsMode() {
-        return getProperty(TS_METRICS_MODE, "log");
+    public MetricBuilder.MetricMode getMetricsMode() {
+        String metricsMode = getProperty(TS_METRICS_MODE, "log");
+        try {
+            return MetricBuilder.MetricMode.valueOf(
+                    metricsMode.replaceAll("\\s", "").toUpperCase());
+        } catch (IllegalArgumentException | NullPointerException e) {
+            logger.error(
+                    "Configured metrics mode \"{}\" not supported. Defaulting to \"{}\" mode: {}",
+                    metricsMode,
+                    MetricBuilder.MetricMode.LOG,
+                    e);
+            return MetricBuilder.MetricMode.LOG;
+        }
+    }
+
+    public boolean isModelMetricsAutoDetectEnabled() {
+        return Boolean.parseBoolean(getProperty(TS_MODEL_METRICS_AUTO_DETECT, "false"));
+    }
 
     public boolean isSystemMetricsDisabled() {
diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java
index 4ee74e88ad..f3a0db10ca 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java
@@ -14,7 +14,6 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.pytorch.serve.archive.model.ModelConfig;
-import org.pytorch.serve.metrics.Dimension;
 import org.pytorch.serve.metrics.Metric;
 import org.pytorch.serve.metrics.MetricCache;
 import org.pytorch.serve.util.ConfigManager;
@@ -301,34 +300,42 @@ public void run() {
                     Matcher matcher = METRIC_PATTERN.matcher(result);
                     if (matcher.matches()) {
                         logger.info("result={}, pattern={}", result, matcher.group(2));
+
                         Metric parsedMetric = Metric.parse(matcher.group(3));
-                        if (parsedMetric != null) {
+                        if (parsedMetric == null) {
+                            logger.error("Failed to parse metrics line: \"{}\".", result);
+                            continue;
+                        }
+
+                        try {
                             if (this.metricCache.getMetricBackend(parsedMetric.getMetricName())
-                                    != null) {
-                                try {
-                                    List<String> dimensionValues = new ArrayList<String>();
-                                    for (Dimension dimension : parsedMetric.getDimensions()) {
-                                        dimensionValues.add(dimension.getValue());
-                                    }
-                                    // Hostname is added as a dimension by default to backend
-                                    // metrics
-                                    dimensionValues.add(parsedMetric.getHostName());
-                                    this.metricCache
-                                            .getMetricBackend(parsedMetric.getMetricName())
-                                            .addOrUpdate(
-                                                    dimensionValues,
-                                                    parsedMetric.getRequestId(),
-                                                    Double.parseDouble(parsedMetric.getValue()));
-                                } catch (Exception e) {
-                                    logger.error(
-                                            "Failed to update backend metric ",
-                                            parsedMetric.getMetricName(),
-                                            ": ",
-                                            e);
+                                    == null) {
+                                if (!lifeCycle.configManager.isModelMetricsAutoDetectEnabled()) {
+                                    continue;
                                 }
+
+                                logger.info(
+                                        "Registering auto detected backend metric: {}",
+                                        parsedMetric);
+                                this.metricCache.addAutoDetectMetricBackend(parsedMetric);
                             }
-                        } else {
-                            logger.error("Failed to parse metrics line: \"{}\".", result);
+
+                            // Hostname is added as a dimension by default to backend metrics
+                            List<String> dimensionValues = parsedMetric.getDimensionValues();
+                            dimensionValues.add(parsedMetric.getHostName());
+
+                            this.metricCache
+                                    .getMetricBackend(parsedMetric.getMetricName())
+                                    .addOrUpdate(
+                                            dimensionValues,
+                                            parsedMetric.getRequestId(),
+                                            Double.parseDouble(parsedMetric.getValue()));
+                        } catch (Exception e) {
+                            logger.error(
+                                    "Failed to update backend metric {}: ",
+                                    parsedMetric.getMetricName(),
+                                    e);
                         }
                         continue;
                     }
diff --git a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java
index 7170757287..a38bbdaade 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java
@@ -1,5 +1,6 @@
 package org.pytorch.serve.metrics;
 
+import java.util.Arrays;
 import org.pytorch.serve.metrics.format.prometheous.PrometheusCounter;
 import org.pytorch.serve.metrics.format.prometheous.PrometheusGauge;
 import org.testng.Assert;
@@ -60,4 +61,25 @@ public void testMetricCacheLoadValidConfiguration() {
metricCache.getMetricBackend("PredictionTime").getClass(), PrometheusGauge.class); Assert.assertEquals(metricCache.getMetricBackend("InvalidMetric"), null); } + + @Test + public void testMetricCacheAddAutoDetectMetricBackend() { + MetricCache metricCache = MetricCache.getInstance(); + Metric metric = + new Metric( + "TestMetricWithType", + "5.0", + "count", + "GAUGE", + "test-host", + new Dimension("ModelName", "mnist"), + new Dimension("Level", "model")); + metricCache.addAutoDetectMetricBackend(metric); + IMetric cachedMetric = metricCache.getMetricBackend("TestMetricWithType"); + Assert.assertEquals(cachedMetric.type, MetricBuilder.MetricType.GAUGE); + Assert.assertEquals(cachedMetric.name, "TestMetricWithType"); + Assert.assertEquals(cachedMetric.unit, "count"); + Assert.assertEquals( + cachedMetric.dimensionNames, Arrays.asList("ModelName", "Level", "Hostname")); + } } diff --git a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java index ba8b5ca58a..11e6c9bcf6 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java +++ b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java @@ -3,6 +3,7 @@ import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.Arrays; +import org.pytorch.serve.metrics.MetricBuilder; import org.pytorch.serve.util.ConfigManager; import org.testng.Assert; import org.testng.annotations.Test; @@ -126,11 +127,11 @@ public void testLoadInvalidConfigurationMissingMetricUnit() { @Test public void testMetricsModeConfiguration() { ConfigManager configManager = ConfigManager.getInstance(); - String existingMetricsModeConfiguration = configManager.getMetricsMode(); - Assert.assertEquals(existingMetricsModeConfiguration, "log"); - configManager.setProperty("metrics_mode", "test_metrics_mode"); - Assert.assertEquals(configManager.getMetricsMode(), "test_metrics_mode"); + MetricBuilder.MetricMode existingMetricsModeConfiguration = configManager.getMetricsMode(); + Assert.assertEquals(existingMetricsModeConfiguration, MetricBuilder.MetricMode.LOG); + configManager.setProperty("metrics_mode", "prometheus"); + Assert.assertEquals(configManager.getMetricsMode(), MetricBuilder.MetricMode.PROMETHEUS); // Restore original metrics mode configuration - configManager.setProperty("metrics_mode", existingMetricsModeConfiguration); + configManager.setProperty("metrics_mode", existingMetricsModeConfiguration.toString()); } } diff --git a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java index 5591e93910..ad774766ac 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java +++ b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java @@ -4,6 +4,7 @@ import java.io.StringWriter; import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.core.Logger; import org.apache.logging.log4j.core.appender.WriterAppender; @@ -266,4 +267,54 @@ public void testFrontendPrometheusHistogram() { testMetricDimensionValues.toArray(new String[0])); Assert.assertEquals(metricValue, Double.valueOf(3.0)); } + + @Test + public void testParseBackendMetricLogWithoutType() { + String backendMetricLog = + 
"HandlerTime.Milliseconds:71.77|#ModelName:mnist,Level:Model|#hostname:test-host,1699061430,6d1726a7-172c-4010-b671-01d71bacc451"; + Metric parsedMetric = Metric.parse(backendMetricLog); + + Assert.assertEquals("HandlerTime", parsedMetric.getMetricName()); + Assert.assertEquals("Milliseconds", parsedMetric.getUnit()); + Assert.assertEquals("71.77", parsedMetric.getValue()); + List dimensionNames = new ArrayList(); + for (Dimension dimension : parsedMetric.getDimensions()) { + dimensionNames.add(dimension.getName()); + } + Assert.assertEquals(Arrays.asList("ModelName", "Level"), dimensionNames); + List dimensionValues = new ArrayList(); + for (Dimension dimension : parsedMetric.getDimensions()) { + dimensionValues.add(dimension.getValue()); + } + Assert.assertEquals(Arrays.asList("mnist", "Model"), dimensionValues); + Assert.assertEquals(null, parsedMetric.getType()); + Assert.assertEquals("test-host", parsedMetric.getHostName()); + Assert.assertEquals("1699061430", parsedMetric.getTimestamp()); + Assert.assertEquals("6d1726a7-172c-4010-b671-01d71bacc451", parsedMetric.getRequestId()); + } + + @Test + public void testParseBackendMetricLogWithType() { + String backendMetricLog = + "PredictionTime.Milliseconds:71.95|#ModelName:mnist,Level:Model|#type:GAUGE|#hostname:test-host,1699061430,6d1726a7-172c-4010-b671-01d71bacc451"; + Metric parsedMetric = Metric.parse(backendMetricLog); + + Assert.assertEquals("PredictionTime", parsedMetric.getMetricName()); + Assert.assertEquals("Milliseconds", parsedMetric.getUnit()); + Assert.assertEquals("71.95", parsedMetric.getValue()); + List dimensionNames = new ArrayList(); + for (Dimension dimension : parsedMetric.getDimensions()) { + dimensionNames.add(dimension.getName()); + } + Assert.assertEquals(Arrays.asList("ModelName", "Level"), dimensionNames); + List dimensionValues = new ArrayList(); + for (Dimension dimension : parsedMetric.getDimensions()) { + dimensionValues.add(dimension.getValue()); + } + Assert.assertEquals(Arrays.asList("mnist", "Model"), dimensionValues); + Assert.assertEquals("GAUGE", parsedMetric.getType()); + Assert.assertEquals("test-host", parsedMetric.getHostName()); + Assert.assertEquals("1699061430", parsedMetric.getTimestamp()); + Assert.assertEquals("6d1726a7-172c-4010-b671-01d71bacc451", parsedMetric.getRequestId()); + } } diff --git a/test/pytest/test_data/metrics/metrics_auto_detect.yaml b/test/pytest/test_data/metrics/metrics_auto_detect.yaml new file mode 100644 index 0000000000..e2740b3117 --- /dev/null +++ b/test/pytest/test_data/metrics/metrics_auto_detect.yaml @@ -0,0 +1,81 @@ +dimensions: + - &model_name "ModelName" + - &worker_name "WorkerName" + - &level "Level" + - &device_id "DeviceId" + - &hostname "Hostname" + +ts_metrics: + counter: + - name: Requests2XX + unit: Count + dimensions: [*level, *hostname] + - name: Requests4XX + unit: Count + dimensions: [*level, *hostname] + - name: Requests5XX + unit: Count + dimensions: [*level, *hostname] + - name: ts_inference_requests_total + unit: Count + dimensions: ["model_name", "model_version", "hostname"] + - name: ts_inference_latency_microseconds + unit: Microseconds + dimensions: ["model_name", "model_version", "hostname"] + - name: ts_queue_latency_microseconds + unit: Microseconds + dimensions: ["model_name", "model_version", "hostname"] + gauge: + - name: QueueTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerThreadTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerLoadTime + unit: Milliseconds + dimensions: 
[*worker_name, *level, *hostname] + - name: CPUUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: MemoryUsed + unit: Megabytes + dimensions: [*level, *hostname] + - name: MemoryAvailable + unit: Megabytes + dimensions: [*level, *hostname] + - name: MemoryUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: DiskUsage + unit: Gigabytes + dimensions: [*level, *hostname] + - name: DiskUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: DiskAvailable + unit: Gigabytes + dimensions: [*level, *hostname] + - name: GPUMemoryUtilization + unit: Percent + dimensions: [*level, *device_id, *hostname] + - name: GPUMemoryUsed + unit: Megabytes + dimensions: [*level, *device_id, *hostname] + - name: GPUUtilization + unit: Percent + dimensions: [*level, *device_id, *hostname] + +model_metrics: + # "PredictionTime" is left undefined to test metrics auto-detection + counter: + - name: PreprocessCallCount + unit: count + dimensions: [*model_name] + gauge: + - name: HandlerTime + unit: ms + dimensions: [*model_name, *level] + - name: RequestBatchSize + unit: count + dimensions: [ "ModelName" ] diff --git a/test/pytest/test_metrics.py b/test/pytest/test_metrics.py index 27547cf471..98b76a798e 100644 --- a/test/pytest/test_metrics.py +++ b/test/pytest/test_metrics.py @@ -3,6 +3,7 @@ import platform import re import shutil +import subprocess import time from os import path @@ -33,16 +34,45 @@ "GPUMemoryUsed", "GPUUtilization", ] -BACKEND_METRICS = ["HandlerTime", "PredictionTime"] +BACKEND_METRICS = [ + "HandlerTime", + "PredictionTime", + "InferenceRequestCount", + "InitializeCallCount", + "PreprocessCallCount", + "PostprocessCallCount", + "RequestBatchSize", + "SizeOfImage", +] +AUTO_DETECT_BACKEND_METRICS = [ + "HandlerMethodTime", + "ExamplePercentMetric", +] def setup_module(module): test_utils.torchserve_cleanup() - response = requests.get( - "https://torchserve.pytorch.org/mar_files/densenet161.mar", allow_redirects=True + + model_archiver_cmd = test_utils.model_archiver_command_builder( + model_name="mnist_custom_metrics", + version="1.0", + model_file=os.path.join( + test_utils.REPO_ROOT, "examples", "image_classifier", "mnist", "mnist.py" + ), + serialized_file=os.path.join( + test_utils.REPO_ROOT, + "examples", + "image_classifier", + "mnist", + "mnist_cnn.pt", + ), + handler=os.path.join( + test_utils.REPO_ROOT, "examples", "custom_metrics", "mnist_handler.py" + ), + export_path=test_utils.MODEL_STORE, ) - with open(path.join(test_utils.MODEL_STORE, "densenet161.mar"), "wb") as f: - f.write(response.content) + model_archiver_cmd = model_archiver_cmd.split(" ") + subprocess.run(model_archiver_cmd) def teardown_module(module): @@ -80,14 +110,20 @@ def run_log_location_var(custom_path=test_utils.ROOT_DIR, no_config_snapshots=Fa assert len(glob.glob(custom_path + "/ts_log.log")) == 1 -def register_densenet161_model_and_make_inference_request(): - test_utils.register_model("densenet161", "densenet161.mar") +def register_model_and_make_inference_request(): + test_utils.register_model("mnist_custom_metrics", "mnist_custom_metrics.mar") data_file = os.path.join( - test_utils.REPO_ROOT, "examples/image_classifier/kitten.jpg" + test_utils.REPO_ROOT, + "examples", + "image_classifier", + "mnist", + "test_data", + "0.png", ) with open(data_file, "rb") as input_data: requests.post( - url=f"http://localhost:8080/predictions/densenet161", data=input_data + url=f"http://localhost:8080/predictions/mnist_custom_metrics", + data=input_data, ) @@ -177,7 +213,7 
@@ def test_log_location_var_snapshot_enabled(): test_utils.stop_torchserve() os.environ["LOG_LOCATION"] = test_utils.ROOT_DIR run_log_location_var(no_config_snapshots=False) - requests.post("http://127.0.0.1:8081/models?url=densenet161.mar") + requests.post("http://127.0.0.1:8081/models?url=mnist_custom_metrics.mar") # We stop torchserve again here so that we can remove the LOG_LOCATION setting from environment variable test_utils.stop_torchserve() print("Waiting to stop") @@ -208,7 +244,7 @@ def test_async_logging(): for f in glob.glob("logs/*.log"): os.remove(f) # delete_all_snapshots() - async_config_file = test_utils.ROOT_DIR + "async-log-config.properties" + async_config_file = os.path.join(test_utils.ROOT_DIR, "async-log-config.properties") with open(async_config_file, "w+") as f: f.write("async_logging=true") test_utils.start_torchserve(snapshot_file=async_config_file) @@ -224,7 +260,7 @@ def test_async_logging_non_boolean(): for f in glob.glob("logs/*.log"): os.remove(f) # delete_all_snapshots() - async_config_file = test_utils.ROOT_DIR + "async-log-config.properties" + async_config_file = os.path.join(test_utils.ROOT_DIR, "async-log-config.properties") with open(async_config_file, "w+") as f: f.write("async_logging=2") test_utils.start_torchserve(snapshot_file=async_config_file) @@ -275,7 +311,7 @@ def test_metrics_location_var_snapshot_enabled(): test_utils.stop_torchserve() os.environ["METRICS_LOCATION"] = test_utils.ROOT_DIR run_metrics_location_var(no_config_snapshots=False) - requests.post("http://127.0.0.1:8081/models?url=densenet161.mar") + requests.post("http://127.0.0.1:8081/models?url=mnist_custom_metrics.mar") # We stop torchserve again here so that we can remove the METRICS_LOCATION setting # from environment variable test_utils.stop_torchserve() @@ -300,7 +336,7 @@ def test_log_location_and_metric_location_vars_snapshot_enabled(): os.environ["METRICS_LOCATION"] = test_utils.ROOT_DIR run_log_location_var(no_config_snapshots=False) run_metrics_location_var(no_config_snapshots=False) - requests.post("http://127.0.0.1:8081/models?url=densenet161.mar") + requests.post("http://127.0.0.1:8081/models?url=mnist_custom_metrics.mar") # We stop torchserve again here so that we can remove the LOG_LOCATION & METRICS_LOCATION # setting from environment variable test_utils.stop_torchserve() @@ -362,7 +398,7 @@ def test_metrics_location_var_snapshot_enabled_rdonly_dir(): os.environ["METRICS_LOCATION"] = RDONLY_DIR try: run_metrics_location_var(custom_path=RDONLY_DIR, no_config_snapshots=False) - requests.post("http://127.0.0.1:8081/models?url=densenet161.mar") + requests.post("http://127.0.0.1:8081/models?url=mnist_custom_metrics.mar") assert len(glob.glob("logs/access_log.log")) == 1 assert len(glob.glob("logs/model_log.log")) == 1 assert len(glob.glob("logs/ts_log.log")) == 1 @@ -384,19 +420,31 @@ def test_metrics_log_mode(): for f in glob.glob("logs/*.log"): os.remove(f) + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") + with open(config_file, "w") as f: + f.write("enable_envvars_config=true") + + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, "examples", "custom_metrics", "metrics.yaml" + ) + try: test_utils.start_torchserve( model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, no_config_snapshots=True, gen_mar=False, ) - register_densenet161_model_and_make_inference_request() + register_model_and_make_inference_request() validate_metrics_log("ts_metrics.log", FRONTEND_METRICS, True) validate_metrics_log("ts_metrics.log", 
SYSTEM_METRICS, True) validate_metrics_log("model_metrics.log", BACKEND_METRICS, True) + validate_metrics_log("model_metrics.log", AUTO_DETECT_BACKEND_METRICS, False) finally: test_utils.stop_torchserve() test_utils.delete_all_snapshots() + del os.environ["TS_METRICS_CONFIG"] + os.remove(config_file) def test_metrics_prometheus_mode(): @@ -410,11 +458,14 @@ def test_metrics_prometheus_mode(): for f in glob.glob("logs/*.log"): os.remove(f) - config_file = test_utils.ROOT_DIR + "config.properties" + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") with open(config_file, "w") as f: f.write("enable_envvars_config=true") os.environ["TS_METRICS_MODE"] = "prometheus" + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, "examples", "custom_metrics", "metrics.yaml" + ) try: test_utils.start_torchserve( @@ -423,10 +474,11 @@ def test_metrics_prometheus_mode(): no_config_snapshots=True, gen_mar=False, ) - register_densenet161_model_and_make_inference_request() + register_model_and_make_inference_request() validate_metrics_log("ts_metrics.log", FRONTEND_METRICS, False) validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, False) validate_metrics_log("model_metrics.log", BACKEND_METRICS, False) + validate_metrics_log("model_metrics.log", AUTO_DETECT_BACKEND_METRICS, False) response = requests.get("http://localhost:8082/metrics") prometheus_metrics = response.text @@ -436,34 +488,190 @@ def test_metrics_prometheus_mode(): assert metric_name in prometheus_metrics for metric_name in BACKEND_METRICS: assert metric_name in prometheus_metrics + for metric_name in AUTO_DETECT_BACKEND_METRICS: + assert metric_name not in prometheus_metrics prometheus_metric_patterns = [ + r"TYPE Requests2XX counter", r'Requests2XX\{Level="Host",Hostname=".+",\} \d+\.\d+', - r'ts_inference_requests_total\{model_name="densenet161",model_version="default",hostname=".+",\} \d+\.\d+', - r'ts_inference_latency_microseconds\{model_name="densenet161",model_version="default",hostname=".+",\} \d+\.\d+', - r'ts_queue_latency_microseconds\{model_name="densenet161",model_version="default",hostname=".+",\} \d+\.\d+', + r"TYPE ts_inference_requests_total counter", + r'ts_inference_requests_total\{model_name="mnist_custom_metrics",model_version="default",hostname=".+",\} \d+\.\d+', + r"TYPE ts_inference_latency_microseconds counter", + r'ts_inference_latency_microseconds\{model_name="mnist_custom_metrics",model_version="default",hostname=".+",\} \d+\.\d+', + r"TYPE ts_queue_latency_microseconds counter", + r'ts_queue_latency_microseconds\{model_name="mnist_custom_metrics",model_version="default",hostname=".+",\} \d+\.\d+', + r"TYPE QueueTime gauge", r'QueueTime\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE WorkerThreadTime gauge", r'WorkerThreadTime\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE WorkerLoadTime gauge", r'WorkerLoadTime\{WorkerName=".+",Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE CPUUtilization gauge", r'CPUUtilization\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE MemoryUsed gauge", r'MemoryUsed\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE MemoryAvailable gauge", r'MemoryAvailable\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE MemoryUtilization gauge", r'MemoryUtilization\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE DiskUsage gauge", r'DiskUsage\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE DiskUtilization gauge", r'DiskUtilization\{Level="Host",Hostname=".+",\} \d+\.\d+', + r"TYPE DiskAvailable gauge", 
r'DiskAvailable\{Level="Host",Hostname=".+",\} \d+\.\d+', - r'HandlerTime\{ModelName="densenet161",Level="Model",Hostname=".+",\} \d+\.\d+', - r'PredictionTime\{ModelName="densenet161",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE HandlerTime gauge", + r'HandlerTime\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE PredictionTime gauge", + r'PredictionTime\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE InferenceRequestCount counter", + r'InferenceRequestCount\{Hostname=".+",\} \d+\.\d+', + r"TYPE InitializeCallCount counter", + r'InitializeCallCount\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE PreprocessCallCount counter", + r'PreprocessCallCount\{ModelName="mnist_custom_metrics",Hostname=".+",\} \d+\.\d+', + r"TYPE PostprocessCallCount counter", + r'PostprocessCallCount\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE RequestBatchSize gauge", + r'RequestBatchSize\{ModelName="mnist_custom_metrics",Hostname=".+",\} \d+\.\d+', + r"TYPE SizeOfImage gauge", + r'SizeOfImage\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + ] + + for pattern in prometheus_metric_patterns: + matches = re.findall(pattern, prometheus_metrics) + assert len(matches) == 1, "pattern not found: " + pattern + + finally: + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + del os.environ["TS_METRICS_MODE"] + del os.environ["TS_METRICS_CONFIG"] + os.remove(config_file) + + +def test_auto_detect_backend_metrics_log_mode(): + """ + Validates that auto-detection of backend metrics works with log mode + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") + with open(config_file, "w") as f: + f.write("enable_envvars_config=true") + + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, + "test", + "pytest", + "test_data", + "metrics", + "metrics_auto_detect.yaml", + ) + os.environ["TS_MODEL_METRICS_AUTO_DETECT"] = "true" + + try: + test_utils.start_torchserve( + model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, + no_config_snapshots=True, + gen_mar=False, + ) + register_model_and_make_inference_request() + validate_metrics_log("model_metrics.log", BACKEND_METRICS, True) + validate_metrics_log("model_metrics.log", AUTO_DETECT_BACKEND_METRICS, True) + finally: + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + del os.environ["TS_METRICS_CONFIG"] + del os.environ["TS_MODEL_METRICS_AUTO_DETECT"] + os.remove(config_file) + + +def test_auto_detect_backend_metrics_prometheus_mode(): + """ + Validates that auto-detection of backend metrics works with prometheus mode + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") + with open(config_file, "w") as f: + f.write("enable_envvars_config=true") + + os.environ["TS_METRICS_MODE"] = "prometheus" + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, + "test", + "pytest", + "test_data", + "metrics", + "metrics_auto_detect.yaml", + ) + os.environ["TS_MODEL_METRICS_AUTO_DETECT"] = "true" + + try: + test_utils.start_torchserve( + 
model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, + no_config_snapshots=True, + gen_mar=False, + ) + register_model_and_make_inference_request() + + validate_metrics_log("model_metrics.log", BACKEND_METRICS, False) + validate_metrics_log("model_metrics.log", AUTO_DETECT_BACKEND_METRICS, False) + + response = requests.get("http://localhost:8082/metrics") + prometheus_metrics = response.text + for metric_name in BACKEND_METRICS: + assert metric_name in prometheus_metrics + for metric_name in AUTO_DETECT_BACKEND_METRICS: + assert metric_name in prometheus_metrics + + prometheus_metric_patterns = [ + r"TYPE HandlerTime gauge", + r'HandlerTime\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE PredictionTime gauge", + r'PredictionTime\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE InferenceRequestCount counter", + r'InferenceRequestCount\{Hostname=".+",\} \d+\.\d+', + r"TYPE InitializeCallCount counter", + r'InitializeCallCount\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE PreprocessCallCount counter", + r'PreprocessCallCount\{ModelName="mnist_custom_metrics",Hostname=".+",\} \d+\.\d+', + r"TYPE PostprocessCallCount counter", + r'PostprocessCallCount\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE RequestBatchSize gauge", + r'RequestBatchSize\{ModelName="mnist_custom_metrics",Hostname=".+",\} \d+\.\d+', + r"TYPE SizeOfImage gauge", + r'SizeOfImage\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE HandlerMethodTime gauge", + r'HandlerMethodTime\{MethodName="preprocess",ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r"TYPE ExamplePercentMetric histogram", + r'ExamplePercentMetric_sum\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', + r'ExamplePercentMetric_count\{ModelName="mnist_custom_metrics",Level="Model",Hostname=".+",\} \d+\.\d+', ] for pattern in prometheus_metric_patterns: matches = re.findall(pattern, prometheus_metrics) - assert len(matches) == 1 + assert len(matches) == 1, "pattern not found: " + pattern finally: test_utils.stop_torchserve() test_utils.delete_all_snapshots() del os.environ["TS_METRICS_MODE"] + del os.environ["TS_METRICS_CONFIG"] + del os.environ["TS_MODEL_METRICS_AUTO_DETECT"] os.remove(config_file) @@ -478,15 +686,28 @@ def test_collect_system_metrics_when_not_disabled(): for f in glob.glob("logs/*.log"): os.remove(f) + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") + with open(config_file, "w") as f: + f.write("enable_envvars_config=true") + + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, "examples", "custom_metrics", "metrics.yaml" + ) + try: test_utils.start_torchserve( - model_store=test_utils.MODEL_STORE, no_config_snapshots=True, gen_mar=False + model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, + no_config_snapshots=True, + gen_mar=False, ) - register_densenet161_model_and_make_inference_request() + register_model_and_make_inference_request() validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, True) finally: test_utils.stop_torchserve() test_utils.delete_all_snapshots() + del os.environ["TS_METRICS_CONFIG"] + os.remove(config_file) def test_disable_system_metrics_using_config_properties(): @@ -501,9 +722,13 @@ def test_disable_system_metrics_using_config_properties(): for f in glob.glob("logs/*.log"): os.remove(f) - config_file = 
test_utils.ROOT_DIR + "config.properties" + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") with open(config_file, "w") as f: - f.write("disable_system_metrics=true") + f.writelines(["enable_envvars_config=true\n", "disable_system_metrics=true"]) + + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, "examples", "custom_metrics", "metrics.yaml" + ) try: test_utils.start_torchserve( @@ -512,11 +737,12 @@ def test_disable_system_metrics_using_config_properties(): no_config_snapshots=True, gen_mar=False, ) - register_densenet161_model_and_make_inference_request() + register_model_and_make_inference_request() validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, False) finally: test_utils.stop_torchserve() test_utils.delete_all_snapshots() + del os.environ["TS_METRICS_CONFIG"] os.remove(config_file) @@ -532,11 +758,14 @@ def test_disable_system_metrics_using_environment_variable(): for f in glob.glob("logs/*.log"): os.remove(f) - config_file = test_utils.ROOT_DIR + "config.properties" + config_file = os.path.join(test_utils.ROOT_DIR, "config.properties") with open(config_file, "w") as f: f.write("enable_envvars_config=true") os.environ["TS_DISABLE_SYSTEM_METRICS"] = "true" + os.environ["TS_METRICS_CONFIG"] = os.path.join( + test_utils.REPO_ROOT, "examples", "custom_metrics", "metrics.yaml" + ) try: test_utils.start_torchserve( @@ -545,10 +774,11 @@ def test_disable_system_metrics_using_environment_variable(): no_config_snapshots=True, gen_mar=False, ) - register_densenet161_model_and_make_inference_request() + register_model_and_make_inference_request() validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, False) finally: test_utils.stop_torchserve() test_utils.delete_all_snapshots() del os.environ["TS_DISABLE_SYSTEM_METRICS"] + del os.environ["TS_METRICS_CONFIG"] os.remove(config_file) diff --git a/ts/metrics/caching_metric.py b/ts/metrics/caching_metric.py index 89f99b6d34..57330554c3 100644 --- a/ts/metrics/caching_metric.py +++ b/ts/metrics/caching_metric.py @@ -102,7 +102,7 @@ def emit_metrics( """ metric_str = ( f"[METRICS]{self.metric_name}.{self.unit}:{value}|#{dimension_string}|" - f"#hostname:{socket.gethostname()},{int(time.time())}" + f"#type:{self.metric_type.name}|#hostname:{socket.gethostname()},{int(time.time())}" ) if request_id: logger.info(f"{metric_str},{request_id}") diff --git a/ts/model_loader.py b/ts/model_loader.py index fb8dcd161b..3cbc5a6b9c 100644 --- a/ts/model_loader.py +++ b/ts/model_loader.py @@ -5,6 +5,7 @@ import json import logging import os +import uuid from abc import ABCMeta, abstractmethod from builtins import str from typing import Optional @@ -89,6 +90,13 @@ def load( :return: """ logging.debug("Loading model - working dir: %s", os.getcwd()) + + # Backwards Compatibility with releases <=0.6.0 + # Request ID is not set for model load requests + # TODO: UUID serves as a temporary request ID for model load requests + if metrics_cache is not None: + metrics_cache.set_request_ids(str(uuid.uuid4())) + manifest_file = os.path.join(model_dir, "MAR-INF", "MANIFEST.json") manifest = None if os.path.exists(manifest_file): diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index b2b05a22b7..ee0879c2d9 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1137,3 +1137,4 @@ Naver FlashAttention GenAI prem +CachingMetric