stateful inference #2513
We're not using the network in the handler, so no need for any layers. Just return x in forward.
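For example, a minimal sketch of what this suggestion amounts to (the class name here is illustrative, not the one in this PR):

```python
import torch
import torch.nn as nn


class IdentityModelSketch(nn.Module):
    """Placeholder model: the handler holds the state, so forward is a pass-through."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # No layers needed; just return the input unchanged.
        return x
```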
To confirm, is it the case that `batchSize` is the least upper bound of `len(data)`, i.e. `len(data) <= batchSize`, and for all `l` such that `len(data) <= l`, `batchSize <= l`?

Is it possible for two separate requests to get batched to this worker? If so, suppose there are two separate streaming requests that are batched to this worker. What happens if one client is much, much faster than the other? Do we throttle the faster client to match the speed of the slower one by buffering the faster client's messages?
Q1: Yes, `len(data) <= batchSize`. `data` is a batch of requests received in real time.

Q2: Yes, a batch of requests comes from different sequences. E.g., `len(data) == 4` means there are 4 sequences. Each sequence has its own dedicated jobQ. Only the parameter `maxBatchDelay` decides how many msec are spent batching a group of requests from different sequences. In other words, differences in traffic volume across sequences have no impact on batching latency.
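As a rough illustration of that batching model, here is a hypothetical handler sketch (not the code in this PR) in which each element of `data` may come from a different sequence and per-sequence state is keyed by sequence id; it assumes the `get_sequence_id(idx)` helper discussed later in this thread:

```python
class StatefulHandlerSketch:
    """Illustrative only: keeps a toy per-sequence counter across batched requests."""

    def __init__(self):
        self.sequence_state = {}  # sequence_id -> accumulated state

    def handle(self, data, context):
        responses = []
        for idx, _row in enumerate(data):
            # Assumes the context exposes get_sequence_id(idx) for the idx-th request.
            seq_id = context.get_sequence_id(idx)
            count = self.sequence_state.get(seq_id, 0) + 1  # toy "state": requests seen
            self.sequence_state[seq_id] = count
            responses.append({"sequence_id": seq_id, "count": count})
        return responses
```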
OK, but if two streams produce data at drastically different rates, how do you keep the batch index coherent? For instance, fix a stateful worker. At time `t_0`, the worker receives data `d_0_0` and `d_1_0` from two streams. So then `len(data) == 2`, `data[0]` is the payload for stream 0, and `data[1]` is the payload for stream 1.

At `t_1`, stream 0 does not produce any data because it took longer than `maxBatchDelay`, but stream 1 produces data `d_1_1`. So then `len(data) == 1` and `data[0]` is the payload for stream 1. In the line below, `idx == 0`, so you fetch the sequence ID for index 0. It seems like this would fetch the sequence ID for stream 0, but you actually want the sequence ID for stream 1. Am I understanding the API semantics correctly? Perhaps I am misunderstanding how `context.get_sequence_id` works. Does it keep track of which stream corresponds to the elements of the `data` list passed to the handler?
Each request's sequence id is added to its header with key = `"ts_request_sequence_id"`. The backend can get a request's sequence id via its header. This guarantees we can always get the sequence id, regardless of whether the real batch size changes or a sequence's request lands in a different batch slot.
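For instance, a handler can recover each request's sequence id from its header regardless of the request's slot in the batch. A minimal sketch (it assumes the context's `get_request_header(idx, key)` accessor; the header key is the one described above):

```python
def get_sequence_ids(data, context):
    """Map each batch slot to the sequence id carried in that request's header."""
    sequence_ids = {}
    for idx in range(len(data)):
        # get_request_header(idx, key) looks up a header of the idx-th request in the batch.
        sequence_ids[idx] = context.get_request_header(idx, "ts_request_sequence_id")
    return sequence_ids
```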