From f4066f2cbe41c8ec447c395587860cbc8a6b2703 Mon Sep 17 00:00:00 2001 From: Robert Samoilescu Date: Fri, 12 Jul 2024 15:35:41 +0100 Subject: [PATCH] Updated docs. --- docs/examples/streaming/README.ipynb | 57 ++++++++++++++++++--------- docs/examples/streaming/README.md | 8 +--- docs/examples/streaming/settings.json | 3 +- docs/examples/streaming/text_model.py | 14 +------ docs/user-guide/streaming.md | 1 - 5 files changed, 42 insertions(+), 41 deletions(-) diff --git a/docs/examples/streaming/README.ipynb b/docs/examples/streaming/README.ipynb index 025246237..1273b26c4 100644 --- a/docs/examples/streaming/README.ipynb +++ b/docs/examples/streaming/README.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -138,8 +138,7 @@ "{\n", " \"debug\": false,\n", " \"parallel_workers\": 0,\n", - " \"gzip_enabled\": false,\n", - " \"metrics_endpoint\": null\n", + " \"gzip_enabled\": false\n", "}\n" ] }, @@ -150,8 +149,7 @@ "Note the currently there are three main limitations of the streaming support in MLServer:\n", "\n", "- distributed workers are not supported (i.e., the `parallel_workers` setting should be set to `0`)\n", - "- `gzip` middleware is not supported for REST (i.e., `gzip_enabled` setting should be set to `false`)\n", - "- metrics endpoint is not available (i.e. `metrics_endpoint` is also disabled for streaming for gRPC)" + "- `gzip` middleware is not supported for REST (i.e., `gzip_enabled` setting should be set to `false`)" ] }, { @@ -163,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -227,14 +225,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Writing generate-request.json\n" + "Overwriting generate-request.json\n" ] } ], @@ -272,9 +270,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['What']\n", + "[' is']\n", + "[' the']\n", + "[' capital']\n", + "[' of']\n", + "[' France?']\n" + ] + } + ], "source": [ "import httpx\n", "from httpx_sse import connect_sse\n", @@ -301,9 +312,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['What']\n", + "[' is']\n", + "[' the']\n", + "[' capital']\n", + "[' of']\n", + "[' France?']\n" + ] + } + ], "source": [ "import grpc\n", "import mlserver.types as types\n", @@ -315,7 +339,7 @@ "inference_request = types.InferenceRequest.parse_file(\"./generate-request.json\")\n", "\n", "# need to convert from string to bytes for grpc\n", - "inference_request.inputs[0] = StringCodec.encode_input(\"prompt\", inference_request.inputs[0].data.__root__)\n", + "inference_request.inputs[0] = StringCodec.encode_input(\"prompt\", inference_request.inputs[0].data.root)\n", "inference_request_g = converters.ModelInferRequestConverter.from_types(\n", " inference_request, model_name=\"text-model\", model_version=None\n", ")\n", @@ -338,11 +362,6 @@ "source": [ "Note that for gRPC, the request is transformed into an async generator which is then passed to the `ModelStreamInfer` method. 
The response is also an async generator which can be iterated over to get the response." ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { diff --git a/docs/examples/streaming/README.md b/docs/examples/streaming/README.md index 7acdf2090..d91aa7492 100644 --- a/docs/examples/streaming/README.md +++ b/docs/examples/streaming/README.md @@ -78,8 +78,7 @@ The next step will be to create 2 configuration files: { "debug": false, "parallel_workers": 0, - "gzip_enabled": false, - "metrics_endpoint": null + "gzip_enabled": false } ``` @@ -88,7 +87,6 @@ Note the currently there are three main limitations of the streaming support in - distributed workers are not supported (i.e., the `parallel_workers` setting should be set to `0`) - `gzip` middleware is not supported for REST (i.e., `gzip_enabled` setting should be set to `false`) -- metrics endpoint is not available (i.e. `metrics_endpoint` is also disabled for streaming for gRPC) #### model-settings.json @@ -195,7 +193,7 @@ import mlserver.grpc.dataplane_pb2_grpc as dataplane inference_request = types.InferenceRequest.parse_file("./generate-request.json") # need to convert from string to bytes for grpc -inference_request.inputs[0] = StringCodec.encode_input("prompt", inference_request.inputs[0].data.__root__) +inference_request.inputs[0] = StringCodec.encode_input("prompt", inference_request.inputs[0].data.root) inference_request_g = converters.ModelInferRequestConverter.from_types( inference_request, model_name="text-model", model_version=None ) @@ -213,5 +211,3 @@ async with grpc.aio.insecure_channel("localhost:8081") as grpc_channel: ``` Note that for gRPC, the request is transformed into an async generator which is then passed to the `ModelStreamInfer` method. The response is also an async generator which can be iterated over to get the response. 
- - diff --git a/docs/examples/streaming/settings.json b/docs/examples/streaming/settings.json index ec853b3ba..3a95c2882 100644 --- a/docs/examples/streaming/settings.json +++ b/docs/examples/streaming/settings.json @@ -2,6 +2,5 @@ { "debug": false, "parallel_workers": 0, - "gzip_enabled": false, - "metrics_endpoint": null + "gzip_enabled": false } diff --git a/docs/examples/streaming/text_model.py b/docs/examples/streaming/text_model.py index 4475b3c92..35b167bb5 100644 --- a/docs/examples/streaming/text_model.py +++ b/docs/examples/streaming/text_model.py @@ -1,3 +1,4 @@ + import asyncio from typing import AsyncIterator from mlserver import MLModel @@ -7,19 +8,6 @@ class TextModel(MLModel): - async def predict(self, payload: InferenceRequest) -> InferenceResponse: - text = StringCodec.decode_input(payload.inputs[0])[0] - return InferenceResponse( - model_name=self._settings.name, - outputs=[ - StringCodec.encode_output( - name="output", - payload=[text], - use_bytes=True, - ), - ], - ) - async def predict_stream( self, payloads: AsyncIterator[InferenceRequest] ) -> AsyncIterator[InferenceResponse]: diff --git a/docs/user-guide/streaming.md b/docs/user-guide/streaming.md index 41dec0b03..a576e6a3e 100644 --- a/docs/user-guide/streaming.md +++ b/docs/user-guide/streaming.md @@ -32,4 +32,3 @@ There are three main limitations of the streaming support in MLServer: - the `parallel_workers` setting should be set to `0` to disable distributed workers (to be addressed in future releases) - for REST, the `gzip_enabled` setting should be set to `false` to disable GZIP compression, as streaming is not compatible with GZIP compression (see issue [here]( https://github.com/encode/starlette/issues/20#issuecomment-704106436)) -- `metrics_endpoint` is also disabled for streaming for gRPC (to be addressed in future releases) \ No newline at end of file
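
For context: this patch removes the non-streaming `predict` method from `text_model.py`, while the streaming `predict_stream` method it keeps sits below the edited hunk and is not visible in the diff. The sketch below illustrates what such a streaming handler might look like, assuming (as the notebook output `['What']`, `[' is']`, ... suggests) that the example model simply echoes the prompt back one token at a time. It is a hedged illustration built from the codec calls visible in the removed `predict` method, not necessarily the exact code kept in the repository.

```python
# A minimal sketch of a streaming-only TextModel. Assumptions: the model
# echoes the prompt back token by token (as the notebook output suggests),
# and the short sleep only simulates per-token generation latency.
import asyncio
from typing import AsyncIterator

from mlserver import MLModel
from mlserver.codecs import StringCodec
from mlserver.types import InferenceRequest, InferenceResponse


class TextModel(MLModel):

    async def predict_stream(
        self, payloads: AsyncIterator[InferenceRequest]
    ) -> AsyncIterator[InferenceResponse]:
        # Read the single request from the incoming request stream
        payload = await payloads.__anext__()
        text = StringCodec.decode_input(payload.inputs[0])[0]

        # Split the prompt into whitespace-delimited tokens, keeping the
        # leading space on every token after the first so the client can
        # reassemble the original text.
        words = text.split(" ")
        tokens = [words[0]] + [" " + word for word in words[1:]]

        for token in tokens:
            await asyncio.sleep(0.5)  # simulated generation latency
            yield InferenceResponse(
                model_name=self._settings.name,
                outputs=[
                    StringCodec.encode_output(
                        name="output",
                        payload=[token],
                        use_bytes=True,
                    ),
                ],
            )
```

With a handler along these lines, both the REST SSE client and the gRPC `ModelStreamInfer` client shown above receive one `InferenceResponse` per token, which is why the example output prints `['What']`, `[' is']`, `[' the']`, and so on.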