diff --git a/python/hsml/client/exceptions.py b/python/hsml/client/exceptions.py index b2394c188..6a59909db 100644 --- a/python/hsml/client/exceptions.py +++ b/python/hsml/client/exceptions.py @@ -71,6 +71,13 @@ class ModelServingException(Exception): ERROR_CODE_DEPLOYMENT_NOT_RUNNING = 250001 +class InternalClientError(TypeError): + """Raised when internal client cannot be initialized due to missing arguments.""" + + def __init__(self, message): + super().__init__(message) + + class ExternalClientError(TypeError): """Raised when external client cannot be initialized due to missing arguments.""" diff --git a/python/hsml/client/hopsworks/base.py b/python/hsml/client/hopsworks/base.py index a33afd86b..6b3ee719a 100644 --- a/python/hsml/client/hopsworks/base.py +++ b/python/hsml/client/hopsworks/base.py @@ -105,7 +105,7 @@ def _close(self): """Closes a client. Can be implemented for clean up purposes, not mandatory.""" self._connected = False - def replace_public_host(self, url): + def _replace_public_host(self, url): """replace hostname to public hostname set in HOPSWORKS_PUBLIC_HOST""" ui_url = url._replace(netloc=os.environ[self.HOPSWORKS_PUBLIC_HOST]) return ui_url diff --git a/python/hsml/client/hopsworks/external.py b/python/hsml/client/hopsworks/external.py index 762d3072a..ec66c51a8 100644 --- a/python/hsml/client/hopsworks/external.py +++ b/python/hsml/client/hopsworks/external.py @@ -77,7 +77,7 @@ def _get_project_info(self, project_name): """ return self._send_request("GET", ["project", "getProjectInfo", project_name]) - def replace_public_host(self, url): + def _replace_public_host(self, url): """no need to replace as we are already in external client""" return url diff --git a/python/hsml/client/istio/base.py b/python/hsml/client/istio/base.py index 2e4ee4208..9aaab9ba0 100644 --- a/python/hsml/client/istio/base.py +++ b/python/hsml/client/istio/base.py @@ -17,12 +17,12 @@ import os from abc import abstractmethod -from hsml.client import base, exceptions +from hsml.client import base +from hsml.client.istio.grpc.inference_client import GRPCInferenceServerClient class Client(base.Client): SERVING_API_KEY = "SERVING_API_KEY" - ISTIO_ENDPOINT = "ISTIO_ENDPOINT" HOPSWORKS_PUBLIC_HOST = "HOPSWORKS_PUBLIC_HOST" BASE_PATH_PARAMS = [] @@ -80,17 +80,18 @@ def _get_host_port_pair(self): host, port = endpoint.split(":") return host, port - def _get_serving_api_key(self): - """Retrieve serving API key from environment variable.""" - if self.SERVING_API_KEY not in os.environ: - raise exceptions.ExternalClientError("Serving API key not found") - return os.environ[self.SERVING_API_KEY] - def _close(self): """Closes a client. 
Can be implemented for clean up purposes, not mandatory.""" self._connected = False - def replace_public_host(self, url): + def _replace_public_host(self, url): """replace hostname to public hostname set in HOPSWORKS_PUBLIC_HOST""" ui_url = url._replace(netloc=os.environ[self.HOPSWORKS_PUBLIC_HOST]) return ui_url + + def _create_grpc_channel(self, service_hostname: str) -> GRPCInferenceServerClient: + return GRPCInferenceServerClient( + url=self._host + ":" + str(self._port), + channel_args=(("grpc.ssl_target_name_override", service_hostname),), + serving_api_key=self._auth._token, + ) diff --git a/python/hsml/client/istio/external.py b/python/hsml/client/istio/external.py index 0dc87fb7b..d6c47b612 100644 --- a/python/hsml/client/istio/external.py +++ b/python/hsml/client/istio/external.py @@ -48,7 +48,7 @@ def _close(self): """Closes a client.""" self._connected = False - def replace_public_host(self, url): + def _replace_public_host(self, url): """no need to replace as we are already in external client""" return url diff --git a/python/hsml/client/istio/grpc/__init__.py b/python/hsml/client/istio/grpc/__init__.py new file mode 100644 index 000000000..ff8055b9b --- /dev/null +++ b/python/hsml/client/istio/grpc/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/hsml/client/istio/grpc/errors.py b/python/hsml/client/istio/grpc/errors.py new file mode 100644 index 000000000..062630bea --- /dev/null +++ b/python/hsml/client/istio/grpc/errors.py @@ -0,0 +1,30 @@ +# Copyright 2022 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# This implementation has been borrowed from the kserve/kserve repository +# https://github.com/kserve/kserve/blob/release-0.11/python/kserve/kserve/errors.py + + +class InvalidInput(ValueError): + """ + Exception class indicating invalid input arguments. + HTTP Servers should return HTTP_400 (Bad Request). + """ + + def __init__(self, reason): + self.reason = reason + + def __str__(self): + return self.reason diff --git a/python/hsml/client/istio/grpc/exceptions.py b/python/hsml/client/istio/grpc/exceptions.py new file mode 100644 index 000000000..6477c9488 --- /dev/null +++ b/python/hsml/client/istio/grpc/exceptions.py @@ -0,0 +1,123 @@ +# Copyright 2023 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + +# This implementation has been borrowed from kserve/kserve repository +# https://github.com/kserve/kserve/blob/release-0.11/python/kserve/kserve/exceptions.py + +import six + + +class OpenApiException(Exception): + """The base exception class for all OpenAPIExceptions""" + + +class ApiTypeError(OpenApiException, TypeError): + def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None): + """Raises an exception for TypeErrors + + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (list): a list of keys an indices to get to the + current_item + None if unset + valid_classes (tuple): the primitive classes that current item + should be an instance of + None if unset + key_type (bool): False if our value is a value in a dict + True if it is a key in a dict + False if our item is an item in a list + None if unset + """ + self.path_to_item = path_to_item + self.valid_classes = valid_classes + self.key_type = key_type + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiTypeError, self).__init__(full_msg) + + +class ApiValueError(OpenApiException, ValueError): + def __init__(self, msg, path_to_item=None): + """ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (list) the path to the exception in the + received_data dict. 
None if unset + """ + + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiValueError, self).__init__(full_msg) + + +class ApiKeyError(OpenApiException, KeyError): + def __init__(self, msg, path_to_item=None): + """ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (None/list) the path to the exception in the + received_data dict + """ + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiKeyError, self).__init__(full_msg) + + +class ApiException(OpenApiException): + def __init__(self, status=None, reason=None, http_resp=None): + if http_resp: + self.status = http_resp.status + self.reason = http_resp.reason + self.body = http_resp.data + self.headers = http_resp.getheaders() + else: + self.status = status + self.reason = reason + self.body = None + self.headers = None + + def __str__(self): + """Custom error messages for exception""" + error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) + if self.headers: + error_message += "HTTP response headers: {0}\n".format(self.headers) + + if self.body: + error_message += "HTTP response body: {0}\n".format(self.body) + + return error_message + + +def render_path(path_to_item): + """Returns a string representation of a path""" + result = "" + for pth in path_to_item: + if isinstance(pth, six.integer_types): + result += "[{0}]".format(pth) + else: + result += "['{0}']".format(pth) + return result diff --git a/python/hsml/client/istio/grpc/inference_client.py b/python/hsml/client/istio/grpc/inference_client.py new file mode 100644 index 000000000..7bfa51ede --- /dev/null +++ b/python/hsml/client/istio/grpc/inference_client.py @@ -0,0 +1,75 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import grpc + +from hsml.client.istio.utils.infer_type import InferRequest, InferResponse +from hsml.client.istio.grpc.proto.grpc_predict_v2_pb2_grpc import ( + GRPCInferenceServiceStub, +) + + +class GRPCInferenceServerClient: + def __init__( + self, + url, + serving_api_key, + channel_args=None, + ): + if channel_args is not None: + channel_opt = channel_args + else: + channel_opt = [ + ("grpc.max_send_message_length", -1), + ("grpc.max_receive_message_length", -1), + ] + + # Authentication is done via API Key in the Authorization header + self._channel = grpc.insecure_channel(url, options=channel_opt) + self._client_stub = GRPCInferenceServiceStub(self._channel) + self._serving_api_key = serving_api_key + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def __del__(self): + """It is called during object garbage collection.""" + self.close() + + def close(self): + """Close the client. 
Future calls to server will result in an Error.""" + self._channel.close() + + def infer(self, infer_request: InferRequest, headers=None, client_timeout=None): + headers = {} if headers is None else headers + headers["authorization"] = "ApiKey " + self._serving_api_key + metadata = headers.items() + + # convert the InferRequest to a ModelInferRequest message + request = infer_request.to_grpc() + + try: + # send request + model_infer_response = self._client_stub.ModelInfer( + request=request, metadata=metadata, timeout=client_timeout + ) + except grpc.RpcError as rpc_error: + raise rpc_error + + # convert back the ModelInferResponse message to InferResponse + return InferResponse.from_grpc(model_infer_response) diff --git a/python/hsml/client/istio/grpc/proto/__init__.py b/python/hsml/client/istio/grpc/proto/__init__.py new file mode 100644 index 000000000..ff8055b9b --- /dev/null +++ b/python/hsml/client/istio/grpc/proto/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/hsml/client/istio/grpc/proto/grpc_predict_v2.proto b/python/hsml/client/istio/grpc/proto/grpc_predict_v2.proto new file mode 100644 index 000000000..c05221d73 --- /dev/null +++ b/python/hsml/client/istio/grpc/proto/grpc_predict_v2.proto @@ -0,0 +1,362 @@ +// Copyright 2022 The KServe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; +package inference; + +// Inference Server GRPC endpoints. +service GRPCInferenceService +{ + // The ServerLive API indicates if the inference server is able to receive + // and respond to metadata and inference requests. + rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {} + + // The ServerReady API indicates if the server is ready for inferencing. + rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {} + + // The ModelReady API indicates if a specific model is ready for inferencing. + rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {} + + // The ServerMetadata API provides information about the server. Errors are + // indicated by the google.rpc.Status returned for the request. The OK code + // indicates success and other codes indicate failure. + rpc ServerMetadata(ServerMetadataRequest) returns (ServerMetadataResponse) {} + + // The per-model metadata API provides information about a model. Errors are + // indicated by the google.rpc.Status returned for the request. 
The OK code + // indicates success and other codes indicate failure. + rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {} + + // The ModelInfer API performs inference using the specified model. Errors are + // indicated by the google.rpc.Status returned for the request. The OK code + // indicates success and other codes indicate failure. + rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {} + + // Load or reload a model from a repository. + rpc RepositoryModelLoad(RepositoryModelLoadRequest) returns (RepositoryModelLoadResponse) {} + + // Unload a model. + rpc RepositoryModelUnload(RepositoryModelUnloadRequest) returns (RepositoryModelUnloadResponse) {} +} + +message ServerLiveRequest {} + +message ServerLiveResponse +{ + // True if the inference server is live, false if not live. + bool live = 1; +} + +message ServerReadyRequest {} + +message ServerReadyResponse +{ + // True if the inference server is ready, false if not ready. + bool ready = 1; +} + +message ModelReadyRequest +{ + // The name of the model to check for readiness. + string name = 1; + + // The version of the model to check for readiness. If not given the + // server will choose a version based on the model and internal policy. + string version = 2; +} + +message ModelReadyResponse +{ + // True if the model is ready, false if not ready. + bool ready = 1; +} + +message ServerMetadataRequest {} + +message ServerMetadataResponse +{ + // The server name. + string name = 1; + + // The server version. + string version = 2; + + // The extensions supported by the server. + repeated string extensions = 3; +} + +message ModelMetadataRequest +{ + // The name of the model. + string name = 1; + + // The version of the model to check for readiness. If not given the + // server will choose a version based on the model and internal policy. + string version = 2; +} + +message ModelMetadataResponse +{ + // Metadata for a tensor. + message TensorMetadata + { + // The tensor name. + string name = 1; + + // The tensor data type. + string datatype = 2; + + // The tensor shape. A variable-size dimension is represented + // by a -1 value. + repeated int64 shape = 3; + } + + // The model name. + string name = 1; + + // The versions of the model available on the server. + repeated string versions = 2; + + // The model's platform. See Platforms. + string platform = 3; + + // The model's inputs. + repeated TensorMetadata inputs = 4; + + // The model's outputs. + repeated TensorMetadata outputs = 5; +} + +message ModelInferRequest +{ + // An input tensor for an inference request. + message InferInputTensor + { + // The tensor name. + string name = 1; + + // The tensor data type. + string datatype = 2; + + // The tensor shape. + repeated int64 shape = 3; + + // Optional inference input tensor parameters. + map parameters = 4; + + // The tensor contents using a data-type format. This field must + // not be specified if "raw" tensor contents are being used for + // the inference request. + InferTensorContents contents = 5; + } + + // An output tensor requested for an inference request. + message InferRequestedOutputTensor + { + // The tensor name. + string name = 1; + + // Optional requested output tensor parameters. + map parameters = 2; + } + + // The name of the model to use for inferencing. + string model_name = 1; + + // The version of the model to use for inference. If not given the + // server will choose a version based on the model and internal policy. 
+ string model_version = 2; + + // Optional identifier for the request. If specified will be + // returned in the response. + string id = 3; + + // Optional inference parameters. + map parameters = 4; + + // The input tensors for the inference. + repeated InferInputTensor inputs = 5; + + // The requested output tensors for the inference. Optional, if not + // specified all outputs produced by the model will be returned. + repeated InferRequestedOutputTensor outputs = 6; + + // The data contained in an input tensor can be represented in "raw" + // bytes form or in the repeated type that matches the tensor's data + // type. To use the raw representation 'raw_input_contents' must be + // initialized with data for each tensor in the same order as + // 'inputs'. For each tensor, the size of this content must match + // what is expected by the tensor's shape and data type. The raw + // data must be the flattened, one-dimensional, row-major order of + // the tensor elements without any stride or padding between the + // elements. Note that the FP16 and BF16 data types must be represented as + // raw content as there is no specific data type for a 16-bit float type. + // + // If this field is specified then InferInputTensor::contents must + // not be specified for any input tensor. + repeated bytes raw_input_contents = 7; +} + +message ModelInferResponse +{ + // An output tensor returned for an inference request. + message InferOutputTensor + { + // The tensor name. + string name = 1; + + // The tensor data type. + string datatype = 2; + + // The tensor shape. + repeated int64 shape = 3; + + // Optional output tensor parameters. + map parameters = 4; + + // The tensor contents using a data-type format. This field must + // not be specified if "raw" tensor contents are being used for + // the inference response. + InferTensorContents contents = 5; + } + + // The name of the model used for inference. + string model_name = 1; + + // The version of the model used for inference. + string model_version = 2; + + // The id of the inference request if one was specified. + string id = 3; + + // Optional inference response parameters. + map parameters = 4; + + // The output tensors holding inference results. + repeated InferOutputTensor outputs = 5; + + // The data contained in an output tensor can be represented in + // "raw" bytes form or in the repeated type that matches the + // tensor's data type. To use the raw representation 'raw_output_contents' + // must be initialized with data for each tensor in the same order as + // 'outputs'. For each tensor, the size of this content must match + // what is expected by the tensor's shape and data type. The raw + // data must be the flattened, one-dimensional, row-major order of + // the tensor elements without any stride or padding between the + // elements. Note that the FP16 and BF16 data types must be represented as + // raw content as there is no specific data type for a 16-bit float type. + // + // If this field is specified then InferOutputTensor::contents must + // not be specified for any output tensor. + repeated bytes raw_output_contents = 6; +} + +// An inference parameter value. The Parameters message describes a +// “name”/”value” pair, where the “name” is the name of the parameter +// and the “value” is a boolean, integer, or string corresponding to +// the parameter. +message InferParameter +{ + // The parameter value can be a string, an int64, a boolean + // or a message specific to a predefined parameter. 
+ oneof parameter_choice + { + // A boolean parameter value. + bool bool_param = 1; + + // An int64 parameter value. + int64 int64_param = 2; + + // A string parameter value. + string string_param = 3; + } +} + +// The data contained in a tensor represented by the repeated type +// that matches the tensor's data type. Protobuf oneof is not used +// because oneofs cannot contain repeated fields. +message InferTensorContents +{ + // Representation for BOOL data type. The size must match what is + // expected by the tensor's shape. The contents must be the flattened, + // one-dimensional, row-major order of the tensor elements. + repeated bool bool_contents = 1; + + // Representation for INT8, INT16, and INT32 data types. The size + // must match what is expected by the tensor's shape. The contents + // must be the flattened, one-dimensional, row-major order of the + // tensor elements. + repeated int32 int_contents = 2; + + // Representation for INT64 data types. The size must match what + // is expected by the tensor's shape. The contents must be the + // flattened, one-dimensional, row-major order of the tensor elements. + repeated int64 int64_contents = 3; + + // Representation for UINT8, UINT16, and UINT32 data types. The size + // must match what is expected by the tensor's shape. The contents + // must be the flattened, one-dimensional, row-major order of the + // tensor elements. + repeated uint32 uint_contents = 4; + + // Representation for UINT64 data types. The size must match what + // is expected by the tensor's shape. The contents must be the + // flattened, one-dimensional, row-major order of the tensor elements. + repeated uint64 uint64_contents = 5; + + // Representation for FP32 data type. The size must match what is + // expected by the tensor's shape. The contents must be the flattened, + // one-dimensional, row-major order of the tensor elements. + repeated float fp32_contents = 6; + + // Representation for FP64 data type. The size must match what is + // expected by the tensor's shape. The contents must be the flattened, + // one-dimensional, row-major order of the tensor elements. + repeated double fp64_contents = 7; + + // Representation for BYTES data type. The size must match what is + // expected by the tensor's shape. The contents must be the flattened, + // one-dimensional, row-major order of the tensor elements. + repeated bytes bytes_contents = 8; +} + +message RepositoryModelLoadRequest +{ + // The name of the model to load, or reload. + string model_name = 1; +} + +message RepositoryModelLoadResponse +{ + // The name of the model trying to load or reload. + string model_name = 1; + + // boolean parameter to indicate whether model is loaded or not + bool isLoaded = 2; +} + +message RepositoryModelUnloadRequest +{ + // The name of the model to unload. + string model_name = 1; +} + +message RepositoryModelUnloadResponse +{ + // The name of the model trying to load or reload. + string model_name = 1; + + // boolean parameter to indicate whether model is unloaded or not + bool isUnloaded = 2; +} diff --git a/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2.py b/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2.py new file mode 100644 index 000000000..a0b035d7e --- /dev/null +++ b/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2.py @@ -0,0 +1,452 @@ +# Copyright 2022 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: grpc_predict_v2.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x15grpc_predict_v2.proto\x12\tinference"\x13\n\x11ServerLiveRequest""\n\x12ServerLiveResponse\x12\x0c\n\x04live\x18\x01 \x01(\x08"\x14\n\x12ServerReadyRequest"$\n\x13ServerReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"2\n\x11ModelReadyRequest\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t"#\n\x12ModelReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"\x17\n\x15ServerMetadataRequest"K\n\x16ServerMetadataResponse\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12\x12\n\nextensions\x18\x03 \x03(\t"5\n\x14ModelMetadataRequest\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t"\x8d\x02\n\x15ModelMetadataResponse\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\t\x12\x10\n\x08platform\x18\x03 \x01(\t\x12?\n\x06inputs\x18\x04 \x03(\x0b\x32/.inference.ModelMetadataResponse.TensorMetadata\x12@\n\x07outputs\x18\x05 \x03(\x0b\x32/.inference.ModelMetadataResponse.TensorMetadata\x1a?\n\x0eTensorMetadata\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64\x61tatype\x18\x02 \x01(\t\x12\r\n\x05shape\x18\x03 \x03(\x03"\xee\x06\n\x11ModelInferRequest\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x15\n\rmodel_version\x18\x02 \x01(\t\x12\n\n\x02id\x18\x03 \x01(\t\x12@\n\nparameters\x18\x04 \x03(\x0b\x32,.inference.ModelInferRequest.ParametersEntry\x12=\n\x06inputs\x18\x05 \x03(\x0b\x32-.inference.ModelInferRequest.InferInputTensor\x12H\n\x07outputs\x18\x06 \x03(\x0b\x32\x37.inference.ModelInferRequest.InferRequestedOutputTensor\x12\x1a\n\x12raw_input_contents\x18\x07 \x03(\x0c\x1a\x94\x02\n\x10InferInputTensor\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64\x61tatype\x18\x02 \x01(\t\x12\r\n\x05shape\x18\x03 \x03(\x03\x12Q\n\nparameters\x18\x04 \x03(\x0b\x32=.inference.ModelInferRequest.InferInputTensor.ParametersEntry\x12\x30\n\x08\x63ontents\x18\x05 \x01(\x0b\x32\x1e.inference.InferTensorContents\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.inference.InferParameter:\x02\x38\x01\x1a\xd5\x01\n\x1aInferRequestedOutputTensor\x12\x0c\n\x04name\x18\x01 \x01(\t\x12[\n\nparameters\x18\x02 \x03(\x0b\x32G.inference.ModelInferRequest.InferRequestedOutputTensor.ParametersEntry\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.inference.InferParameter:\x02\x38\x01\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 
\x01(\x0b\x32\x19.inference.InferParameter:\x02\x38\x01"\xd5\x04\n\x12ModelInferResponse\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x15\n\rmodel_version\x18\x02 \x01(\t\x12\n\n\x02id\x18\x03 \x01(\t\x12\x41\n\nparameters\x18\x04 \x03(\x0b\x32-.inference.ModelInferResponse.ParametersEntry\x12@\n\x07outputs\x18\x05 \x03(\x0b\x32/.inference.ModelInferResponse.InferOutputTensor\x12\x1b\n\x13raw_output_contents\x18\x06 \x03(\x0c\x1a\x97\x02\n\x11InferOutputTensor\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64\x61tatype\x18\x02 \x01(\t\x12\r\n\x05shape\x18\x03 \x03(\x03\x12S\n\nparameters\x18\x04 \x03(\x0b\x32?.inference.ModelInferResponse.InferOutputTensor.ParametersEntry\x12\x30\n\x08\x63ontents\x18\x05 \x01(\x0b\x32\x1e.inference.InferTensorContents\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.inference.InferParameter:\x02\x38\x01\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.inference.InferParameter:\x02\x38\x01"i\n\x0eInferParameter\x12\x14\n\nbool_param\x18\x01 \x01(\x08H\x00\x12\x15\n\x0bint64_param\x18\x02 \x01(\x03H\x00\x12\x16\n\x0cstring_param\x18\x03 \x01(\tH\x00\x42\x12\n\x10parameter_choice"\xd0\x01\n\x13InferTensorContents\x12\x15\n\rbool_contents\x18\x01 \x03(\x08\x12\x14\n\x0cint_contents\x18\x02 \x03(\x05\x12\x16\n\x0eint64_contents\x18\x03 \x03(\x03\x12\x15\n\ruint_contents\x18\x04 \x03(\r\x12\x17\n\x0fuint64_contents\x18\x05 \x03(\x04\x12\x15\n\rfp32_contents\x18\x06 \x03(\x02\x12\x15\n\rfp64_contents\x18\x07 \x03(\x01\x12\x16\n\x0e\x62ytes_contents\x18\x08 \x03(\x0c"0\n\x1aRepositoryModelLoadRequest\x12\x12\n\nmodel_name\x18\x01 \x01(\t"C\n\x1bRepositoryModelLoadResponse\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08isLoaded\x18\x02 \x01(\x08"2\n\x1cRepositoryModelUnloadRequest\x12\x12\n\nmodel_name\x18\x01 \x01(\t"G\n\x1dRepositoryModelUnloadResponse\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x12\n\nisUnloaded\x18\x02 \x01(\x08\x32\xd2\x05\n\x14GRPCInferenceService\x12K\n\nServerLive\x12\x1c.inference.ServerLiveRequest\x1a\x1d.inference.ServerLiveResponse"\x00\x12N\n\x0bServerReady\x12\x1d.inference.ServerReadyRequest\x1a\x1e.inference.ServerReadyResponse"\x00\x12K\n\nModelReady\x12\x1c.inference.ModelReadyRequest\x1a\x1d.inference.ModelReadyResponse"\x00\x12W\n\x0eServerMetadata\x12 .inference.ServerMetadataRequest\x1a!.inference.ServerMetadataResponse"\x00\x12T\n\rModelMetadata\x12\x1f.inference.ModelMetadataRequest\x1a .inference.ModelMetadataResponse"\x00\x12K\n\nModelInfer\x12\x1c.inference.ModelInferRequest\x1a\x1d.inference.ModelInferResponse"\x00\x12\x66\n\x13RepositoryModelLoad\x12%.inference.RepositoryModelLoadRequest\x1a&.inference.RepositoryModelLoadResponse"\x00\x12l\n\x15RepositoryModelUnload\x12\'.inference.RepositoryModelUnloadRequest\x1a(.inference.RepositoryModelUnloadResponse"\x00\x62\x06proto3' +) + + +_SERVERLIVEREQUEST = DESCRIPTOR.message_types_by_name["ServerLiveRequest"] +_SERVERLIVERESPONSE = DESCRIPTOR.message_types_by_name["ServerLiveResponse"] +_SERVERREADYREQUEST = DESCRIPTOR.message_types_by_name["ServerReadyRequest"] +_SERVERREADYRESPONSE = DESCRIPTOR.message_types_by_name["ServerReadyResponse"] +_MODELREADYREQUEST = DESCRIPTOR.message_types_by_name["ModelReadyRequest"] +_MODELREADYRESPONSE = DESCRIPTOR.message_types_by_name["ModelReadyResponse"] +_SERVERMETADATAREQUEST = DESCRIPTOR.message_types_by_name["ServerMetadataRequest"] +_SERVERMETADATARESPONSE = DESCRIPTOR.message_types_by_name["ServerMetadataResponse"] 
+_MODELMETADATAREQUEST = DESCRIPTOR.message_types_by_name["ModelMetadataRequest"] +_MODELMETADATARESPONSE = DESCRIPTOR.message_types_by_name["ModelMetadataResponse"] +_MODELMETADATARESPONSE_TENSORMETADATA = _MODELMETADATARESPONSE.nested_types_by_name[ + "TensorMetadata" +] +_MODELINFERREQUEST = DESCRIPTOR.message_types_by_name["ModelInferRequest"] +_MODELINFERREQUEST_INFERINPUTTENSOR = _MODELINFERREQUEST.nested_types_by_name[ + "InferInputTensor" +] +_MODELINFERREQUEST_INFERINPUTTENSOR_PARAMETERSENTRY = ( + _MODELINFERREQUEST_INFERINPUTTENSOR.nested_types_by_name["ParametersEntry"] +) +_MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR = _MODELINFERREQUEST.nested_types_by_name[ + "InferRequestedOutputTensor" +] +_MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR_PARAMETERSENTRY = ( + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR.nested_types_by_name[ + "ParametersEntry" + ] +) +_MODELINFERREQUEST_PARAMETERSENTRY = _MODELINFERREQUEST.nested_types_by_name[ + "ParametersEntry" +] +_MODELINFERRESPONSE = DESCRIPTOR.message_types_by_name["ModelInferResponse"] +_MODELINFERRESPONSE_INFEROUTPUTTENSOR = _MODELINFERRESPONSE.nested_types_by_name[ + "InferOutputTensor" +] +_MODELINFERRESPONSE_INFEROUTPUTTENSOR_PARAMETERSENTRY = ( + _MODELINFERRESPONSE_INFEROUTPUTTENSOR.nested_types_by_name["ParametersEntry"] +) +_MODELINFERRESPONSE_PARAMETERSENTRY = _MODELINFERRESPONSE.nested_types_by_name[ + "ParametersEntry" +] +_INFERPARAMETER = DESCRIPTOR.message_types_by_name["InferParameter"] +_INFERTENSORCONTENTS = DESCRIPTOR.message_types_by_name["InferTensorContents"] +_REPOSITORYMODELLOADREQUEST = DESCRIPTOR.message_types_by_name[ + "RepositoryModelLoadRequest" +] +_REPOSITORYMODELLOADRESPONSE = DESCRIPTOR.message_types_by_name[ + "RepositoryModelLoadResponse" +] +_REPOSITORYMODELUNLOADREQUEST = DESCRIPTOR.message_types_by_name[ + "RepositoryModelUnloadRequest" +] +_REPOSITORYMODELUNLOADRESPONSE = DESCRIPTOR.message_types_by_name[ + "RepositoryModelUnloadResponse" +] +ServerLiveRequest = _reflection.GeneratedProtocolMessageType( + "ServerLiveRequest", + (_message.Message,), + { + "DESCRIPTOR": _SERVERLIVEREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ServerLiveRequest) + }, +) +_sym_db.RegisterMessage(ServerLiveRequest) + +ServerLiveResponse = _reflection.GeneratedProtocolMessageType( + "ServerLiveResponse", + (_message.Message,), + { + "DESCRIPTOR": _SERVERLIVERESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ServerLiveResponse) + }, +) +_sym_db.RegisterMessage(ServerLiveResponse) + +ServerReadyRequest = _reflection.GeneratedProtocolMessageType( + "ServerReadyRequest", + (_message.Message,), + { + "DESCRIPTOR": _SERVERREADYREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ServerReadyRequest) + }, +) +_sym_db.RegisterMessage(ServerReadyRequest) + +ServerReadyResponse = _reflection.GeneratedProtocolMessageType( + "ServerReadyResponse", + (_message.Message,), + { + "DESCRIPTOR": _SERVERREADYRESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ServerReadyResponse) + }, +) +_sym_db.RegisterMessage(ServerReadyResponse) + +ModelReadyRequest = _reflection.GeneratedProtocolMessageType( + "ModelReadyRequest", + (_message.Message,), + { + "DESCRIPTOR": _MODELREADYREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelReadyRequest) + }, +) +_sym_db.RegisterMessage(ModelReadyRequest) + 
+ModelReadyResponse = _reflection.GeneratedProtocolMessageType( + "ModelReadyResponse", + (_message.Message,), + { + "DESCRIPTOR": _MODELREADYRESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelReadyResponse) + }, +) +_sym_db.RegisterMessage(ModelReadyResponse) + +ServerMetadataRequest = _reflection.GeneratedProtocolMessageType( + "ServerMetadataRequest", + (_message.Message,), + { + "DESCRIPTOR": _SERVERMETADATAREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ServerMetadataRequest) + }, +) +_sym_db.RegisterMessage(ServerMetadataRequest) + +ServerMetadataResponse = _reflection.GeneratedProtocolMessageType( + "ServerMetadataResponse", + (_message.Message,), + { + "DESCRIPTOR": _SERVERMETADATARESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ServerMetadataResponse) + }, +) +_sym_db.RegisterMessage(ServerMetadataResponse) + +ModelMetadataRequest = _reflection.GeneratedProtocolMessageType( + "ModelMetadataRequest", + (_message.Message,), + { + "DESCRIPTOR": _MODELMETADATAREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelMetadataRequest) + }, +) +_sym_db.RegisterMessage(ModelMetadataRequest) + +ModelMetadataResponse = _reflection.GeneratedProtocolMessageType( + "ModelMetadataResponse", + (_message.Message,), + { + "TensorMetadata": _reflection.GeneratedProtocolMessageType( + "TensorMetadata", + (_message.Message,), + { + "DESCRIPTOR": _MODELMETADATARESPONSE_TENSORMETADATA, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelMetadataResponse.TensorMetadata) + }, + ), + "DESCRIPTOR": _MODELMETADATARESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelMetadataResponse) + }, +) +_sym_db.RegisterMessage(ModelMetadataResponse) +_sym_db.RegisterMessage(ModelMetadataResponse.TensorMetadata) + +ModelInferRequest = _reflection.GeneratedProtocolMessageType( + "ModelInferRequest", + (_message.Message,), + { + "InferInputTensor": _reflection.GeneratedProtocolMessageType( + "InferInputTensor", + (_message.Message,), + { + "ParametersEntry": _reflection.GeneratedProtocolMessageType( + "ParametersEntry", + (_message.Message,), + { + "DESCRIPTOR": _MODELINFERREQUEST_INFERINPUTTENSOR_PARAMETERSENTRY, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferRequest.InferInputTensor.ParametersEntry) + }, + ), + "DESCRIPTOR": _MODELINFERREQUEST_INFERINPUTTENSOR, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferRequest.InferInputTensor) + }, + ), + "InferRequestedOutputTensor": _reflection.GeneratedProtocolMessageType( + "InferRequestedOutputTensor", + (_message.Message,), + { + "ParametersEntry": _reflection.GeneratedProtocolMessageType( + "ParametersEntry", + (_message.Message,), + { + "DESCRIPTOR": _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR_PARAMETERSENTRY, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferRequest.InferRequestedOutputTensor.ParametersEntry) + }, + ), + "DESCRIPTOR": _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferRequest.InferRequestedOutputTensor) + }, + ), + "ParametersEntry": _reflection.GeneratedProtocolMessageType( + "ParametersEntry", + 
(_message.Message,), + { + "DESCRIPTOR": _MODELINFERREQUEST_PARAMETERSENTRY, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferRequest.ParametersEntry) + }, + ), + "DESCRIPTOR": _MODELINFERREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferRequest) + }, +) +_sym_db.RegisterMessage(ModelInferRequest) +_sym_db.RegisterMessage(ModelInferRequest.InferInputTensor) +_sym_db.RegisterMessage(ModelInferRequest.InferInputTensor.ParametersEntry) +_sym_db.RegisterMessage(ModelInferRequest.InferRequestedOutputTensor) +_sym_db.RegisterMessage(ModelInferRequest.InferRequestedOutputTensor.ParametersEntry) +_sym_db.RegisterMessage(ModelInferRequest.ParametersEntry) + +ModelInferResponse = _reflection.GeneratedProtocolMessageType( + "ModelInferResponse", + (_message.Message,), + { + "InferOutputTensor": _reflection.GeneratedProtocolMessageType( + "InferOutputTensor", + (_message.Message,), + { + "ParametersEntry": _reflection.GeneratedProtocolMessageType( + "ParametersEntry", + (_message.Message,), + { + "DESCRIPTOR": _MODELINFERRESPONSE_INFEROUTPUTTENSOR_PARAMETERSENTRY, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferResponse.InferOutputTensor.ParametersEntry) + }, + ), + "DESCRIPTOR": _MODELINFERRESPONSE_INFEROUTPUTTENSOR, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferResponse.InferOutputTensor) + }, + ), + "ParametersEntry": _reflection.GeneratedProtocolMessageType( + "ParametersEntry", + (_message.Message,), + { + "DESCRIPTOR": _MODELINFERRESPONSE_PARAMETERSENTRY, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferResponse.ParametersEntry) + }, + ), + "DESCRIPTOR": _MODELINFERRESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.ModelInferResponse) + }, +) +_sym_db.RegisterMessage(ModelInferResponse) +_sym_db.RegisterMessage(ModelInferResponse.InferOutputTensor) +_sym_db.RegisterMessage(ModelInferResponse.InferOutputTensor.ParametersEntry) +_sym_db.RegisterMessage(ModelInferResponse.ParametersEntry) + +InferParameter = _reflection.GeneratedProtocolMessageType( + "InferParameter", + (_message.Message,), + { + "DESCRIPTOR": _INFERPARAMETER, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.InferParameter) + }, +) +_sym_db.RegisterMessage(InferParameter) + +InferTensorContents = _reflection.GeneratedProtocolMessageType( + "InferTensorContents", + (_message.Message,), + { + "DESCRIPTOR": _INFERTENSORCONTENTS, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.InferTensorContents) + }, +) +_sym_db.RegisterMessage(InferTensorContents) + +RepositoryModelLoadRequest = _reflection.GeneratedProtocolMessageType( + "RepositoryModelLoadRequest", + (_message.Message,), + { + "DESCRIPTOR": _REPOSITORYMODELLOADREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.RepositoryModelLoadRequest) + }, +) +_sym_db.RegisterMessage(RepositoryModelLoadRequest) + +RepositoryModelLoadResponse = _reflection.GeneratedProtocolMessageType( + "RepositoryModelLoadResponse", + (_message.Message,), + { + "DESCRIPTOR": _REPOSITORYMODELLOADRESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.RepositoryModelLoadResponse) + }, +) 
+_sym_db.RegisterMessage(RepositoryModelLoadResponse) + +RepositoryModelUnloadRequest = _reflection.GeneratedProtocolMessageType( + "RepositoryModelUnloadRequest", + (_message.Message,), + { + "DESCRIPTOR": _REPOSITORYMODELUNLOADREQUEST, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.RepositoryModelUnloadRequest) + }, +) +_sym_db.RegisterMessage(RepositoryModelUnloadRequest) + +RepositoryModelUnloadResponse = _reflection.GeneratedProtocolMessageType( + "RepositoryModelUnloadResponse", + (_message.Message,), + { + "DESCRIPTOR": _REPOSITORYMODELUNLOADRESPONSE, + "__module__": "grpc_predict_v2_pb2", + # @@protoc_insertion_point(class_scope:inference.RepositoryModelUnloadResponse) + }, +) +_sym_db.RegisterMessage(RepositoryModelUnloadResponse) + +_GRPCINFERENCESERVICE = DESCRIPTOR.services_by_name["GRPCInferenceService"] +if _descriptor._USE_C_DESCRIPTORS == False: # noqa: E712 + + DESCRIPTOR._options = None + _MODELINFERREQUEST_INFERINPUTTENSOR_PARAMETERSENTRY._options = None + _MODELINFERREQUEST_INFERINPUTTENSOR_PARAMETERSENTRY._serialized_options = b"8\001" + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR_PARAMETERSENTRY._options = None + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR_PARAMETERSENTRY._serialized_options = ( + b"8\001" + ) + _MODELINFERREQUEST_PARAMETERSENTRY._options = None + _MODELINFERREQUEST_PARAMETERSENTRY._serialized_options = b"8\001" + _MODELINFERRESPONSE_INFEROUTPUTTENSOR_PARAMETERSENTRY._options = None + _MODELINFERRESPONSE_INFEROUTPUTTENSOR_PARAMETERSENTRY._serialized_options = b"8\001" + _MODELINFERRESPONSE_PARAMETERSENTRY._options = None + _MODELINFERRESPONSE_PARAMETERSENTRY._serialized_options = b"8\001" + _SERVERLIVEREQUEST._serialized_start = 36 + _SERVERLIVEREQUEST._serialized_end = 55 + _SERVERLIVERESPONSE._serialized_start = 57 + _SERVERLIVERESPONSE._serialized_end = 91 + _SERVERREADYREQUEST._serialized_start = 93 + _SERVERREADYREQUEST._serialized_end = 113 + _SERVERREADYRESPONSE._serialized_start = 115 + _SERVERREADYRESPONSE._serialized_end = 151 + _MODELREADYREQUEST._serialized_start = 153 + _MODELREADYREQUEST._serialized_end = 203 + _MODELREADYRESPONSE._serialized_start = 205 + _MODELREADYRESPONSE._serialized_end = 240 + _SERVERMETADATAREQUEST._serialized_start = 242 + _SERVERMETADATAREQUEST._serialized_end = 265 + _SERVERMETADATARESPONSE._serialized_start = 267 + _SERVERMETADATARESPONSE._serialized_end = 342 + _MODELMETADATAREQUEST._serialized_start = 344 + _MODELMETADATAREQUEST._serialized_end = 397 + _MODELMETADATARESPONSE._serialized_start = 400 + _MODELMETADATARESPONSE._serialized_end = 669 + _MODELMETADATARESPONSE_TENSORMETADATA._serialized_start = 606 + _MODELMETADATARESPONSE_TENSORMETADATA._serialized_end = 669 + _MODELINFERREQUEST._serialized_start = 672 + _MODELINFERREQUEST._serialized_end = 1550 + _MODELINFERREQUEST_INFERINPUTTENSOR._serialized_start = 980 + _MODELINFERREQUEST_INFERINPUTTENSOR._serialized_end = 1256 + _MODELINFERREQUEST_INFERINPUTTENSOR_PARAMETERSENTRY._serialized_start = 1180 + _MODELINFERREQUEST_INFERINPUTTENSOR_PARAMETERSENTRY._serialized_end = 1256 + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR._serialized_start = 1259 + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR._serialized_end = 1472 + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR_PARAMETERSENTRY._serialized_start = ( + 1180 + ) + _MODELINFERREQUEST_INFERREQUESTEDOUTPUTTENSOR_PARAMETERSENTRY._serialized_end = 1256 + _MODELINFERREQUEST_PARAMETERSENTRY._serialized_start = 1180 + 
_MODELINFERREQUEST_PARAMETERSENTRY._serialized_end = 1256 + _MODELINFERRESPONSE._serialized_start = 1553 + _MODELINFERRESPONSE._serialized_end = 2150 + _MODELINFERRESPONSE_INFEROUTPUTTENSOR._serialized_start = 1793 + _MODELINFERRESPONSE_INFEROUTPUTTENSOR._serialized_end = 2072 + _MODELINFERRESPONSE_INFEROUTPUTTENSOR_PARAMETERSENTRY._serialized_start = 1180 + _MODELINFERRESPONSE_INFEROUTPUTTENSOR_PARAMETERSENTRY._serialized_end = 1256 + _MODELINFERRESPONSE_PARAMETERSENTRY._serialized_start = 1180 + _MODELINFERRESPONSE_PARAMETERSENTRY._serialized_end = 1256 + _INFERPARAMETER._serialized_start = 2152 + _INFERPARAMETER._serialized_end = 2257 + _INFERTENSORCONTENTS._serialized_start = 2260 + _INFERTENSORCONTENTS._serialized_end = 2468 + _REPOSITORYMODELLOADREQUEST._serialized_start = 2470 + _REPOSITORYMODELLOADREQUEST._serialized_end = 2518 + _REPOSITORYMODELLOADRESPONSE._serialized_start = 2520 + _REPOSITORYMODELLOADRESPONSE._serialized_end = 2587 + _REPOSITORYMODELUNLOADREQUEST._serialized_start = 2589 + _REPOSITORYMODELUNLOADREQUEST._serialized_end = 2639 + _REPOSITORYMODELUNLOADRESPONSE._serialized_start = 2641 + _REPOSITORYMODELUNLOADRESPONSE._serialized_end = 2712 + _GRPCINFERENCESERVICE._serialized_start = 2715 + _GRPCINFERENCESERVICE._serialized_end = 3437 +# @@protoc_insertion_point(module_scope) diff --git a/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2.pyi b/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2.pyi new file mode 100644 index 000000000..5407c606b --- /dev/null +++ b/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2.pyi @@ -0,0 +1,399 @@ +from google.protobuf.internal import containers as _containers +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from typing import ( + ClassVar as _ClassVar, + Iterable as _Iterable, + Mapping as _Mapping, + Optional as _Optional, + Union as _Union, +) + +DESCRIPTOR: _descriptor.FileDescriptor + +class InferParameter(_message.Message): + __slots__ = ["bool_param", "int64_param", "string_param"] + BOOL_PARAM_FIELD_NUMBER: _ClassVar[int] + INT64_PARAM_FIELD_NUMBER: _ClassVar[int] + STRING_PARAM_FIELD_NUMBER: _ClassVar[int] + bool_param: bool + int64_param: int + string_param: str + def __init__( + self, + bool_param: bool = ..., + int64_param: _Optional[int] = ..., + string_param: _Optional[str] = ..., + ) -> None: ... 
+ +class InferTensorContents(_message.Message): + __slots__ = [ + "bool_contents", + "bytes_contents", + "fp32_contents", + "fp64_contents", + "int64_contents", + "int_contents", + "uint64_contents", + "uint_contents", + ] + BOOL_CONTENTS_FIELD_NUMBER: _ClassVar[int] + BYTES_CONTENTS_FIELD_NUMBER: _ClassVar[int] + FP32_CONTENTS_FIELD_NUMBER: _ClassVar[int] + FP64_CONTENTS_FIELD_NUMBER: _ClassVar[int] + INT64_CONTENTS_FIELD_NUMBER: _ClassVar[int] + INT_CONTENTS_FIELD_NUMBER: _ClassVar[int] + UINT64_CONTENTS_FIELD_NUMBER: _ClassVar[int] + UINT_CONTENTS_FIELD_NUMBER: _ClassVar[int] + bool_contents: _containers.RepeatedScalarFieldContainer[bool] + bytes_contents: _containers.RepeatedScalarFieldContainer[bytes] + fp32_contents: _containers.RepeatedScalarFieldContainer[float] + fp64_contents: _containers.RepeatedScalarFieldContainer[float] + int64_contents: _containers.RepeatedScalarFieldContainer[int] + int_contents: _containers.RepeatedScalarFieldContainer[int] + uint64_contents: _containers.RepeatedScalarFieldContainer[int] + uint_contents: _containers.RepeatedScalarFieldContainer[int] + def __init__( + self, + bool_contents: _Optional[_Iterable[bool]] = ..., + int_contents: _Optional[_Iterable[int]] = ..., + int64_contents: _Optional[_Iterable[int]] = ..., + uint_contents: _Optional[_Iterable[int]] = ..., + uint64_contents: _Optional[_Iterable[int]] = ..., + fp32_contents: _Optional[_Iterable[float]] = ..., + fp64_contents: _Optional[_Iterable[float]] = ..., + bytes_contents: _Optional[_Iterable[bytes]] = ..., + ) -> None: ... + +class ModelInferRequest(_message.Message): + __slots__ = [ + "id", + "inputs", + "model_name", + "model_version", + "outputs", + "parameters", + "raw_input_contents", + ] + + class InferInputTensor(_message.Message): + __slots__ = ["contents", "datatype", "name", "parameters", "shape"] + + class ParametersEntry(_message.Message): + __slots__ = ["key", "value"] + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: InferParameter + def __init__( + self, + key: _Optional[str] = ..., + value: _Optional[_Union[InferParameter, _Mapping]] = ..., + ) -> None: ... + CONTENTS_FIELD_NUMBER: _ClassVar[int] + DATATYPE_FIELD_NUMBER: _ClassVar[int] + NAME_FIELD_NUMBER: _ClassVar[int] + PARAMETERS_FIELD_NUMBER: _ClassVar[int] + SHAPE_FIELD_NUMBER: _ClassVar[int] + contents: InferTensorContents + datatype: str + name: str + parameters: _containers.MessageMap[str, InferParameter] + shape: _containers.RepeatedScalarFieldContainer[int] + def __init__( + self, + name: _Optional[str] = ..., + datatype: _Optional[str] = ..., + shape: _Optional[_Iterable[int]] = ..., + parameters: _Optional[_Mapping[str, InferParameter]] = ..., + contents: _Optional[_Union[InferTensorContents, _Mapping]] = ..., + ) -> None: ... + + class InferRequestedOutputTensor(_message.Message): + __slots__ = ["name", "parameters"] + + class ParametersEntry(_message.Message): + __slots__ = ["key", "value"] + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: InferParameter + def __init__( + self, + key: _Optional[str] = ..., + value: _Optional[_Union[InferParameter, _Mapping]] = ..., + ) -> None: ... + NAME_FIELD_NUMBER: _ClassVar[int] + PARAMETERS_FIELD_NUMBER: _ClassVar[int] + name: str + parameters: _containers.MessageMap[str, InferParameter] + def __init__( + self, + name: _Optional[str] = ..., + parameters: _Optional[_Mapping[str, InferParameter]] = ..., + ) -> None: ... 
+ + class ParametersEntry(_message.Message): + __slots__ = ["key", "value"] + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: InferParameter + def __init__( + self, + key: _Optional[str] = ..., + value: _Optional[_Union[InferParameter, _Mapping]] = ..., + ) -> None: ... + ID_FIELD_NUMBER: _ClassVar[int] + INPUTS_FIELD_NUMBER: _ClassVar[int] + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + MODEL_VERSION_FIELD_NUMBER: _ClassVar[int] + OUTPUTS_FIELD_NUMBER: _ClassVar[int] + PARAMETERS_FIELD_NUMBER: _ClassVar[int] + RAW_INPUT_CONTENTS_FIELD_NUMBER: _ClassVar[int] + id: str + inputs: _containers.RepeatedCompositeFieldContainer[ + ModelInferRequest.InferInputTensor + ] + model_name: str + model_version: str + outputs: _containers.RepeatedCompositeFieldContainer[ + ModelInferRequest.InferRequestedOutputTensor + ] + parameters: _containers.MessageMap[str, InferParameter] + raw_input_contents: _containers.RepeatedScalarFieldContainer[bytes] + def __init__( + self, + model_name: _Optional[str] = ..., + model_version: _Optional[str] = ..., + id: _Optional[str] = ..., + parameters: _Optional[_Mapping[str, InferParameter]] = ..., + inputs: _Optional[ + _Iterable[_Union[ModelInferRequest.InferInputTensor, _Mapping]] + ] = ..., + outputs: _Optional[ + _Iterable[_Union[ModelInferRequest.InferRequestedOutputTensor, _Mapping]] + ] = ..., + raw_input_contents: _Optional[_Iterable[bytes]] = ..., + ) -> None: ... + +class ModelInferResponse(_message.Message): + __slots__ = [ + "id", + "model_name", + "model_version", + "outputs", + "parameters", + "raw_output_contents", + ] + + class InferOutputTensor(_message.Message): + __slots__ = ["contents", "datatype", "name", "parameters", "shape"] + + class ParametersEntry(_message.Message): + __slots__ = ["key", "value"] + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: InferParameter + def __init__( + self, + key: _Optional[str] = ..., + value: _Optional[_Union[InferParameter, _Mapping]] = ..., + ) -> None: ... + CONTENTS_FIELD_NUMBER: _ClassVar[int] + DATATYPE_FIELD_NUMBER: _ClassVar[int] + NAME_FIELD_NUMBER: _ClassVar[int] + PARAMETERS_FIELD_NUMBER: _ClassVar[int] + SHAPE_FIELD_NUMBER: _ClassVar[int] + contents: InferTensorContents + datatype: str + name: str + parameters: _containers.MessageMap[str, InferParameter] + shape: _containers.RepeatedScalarFieldContainer[int] + def __init__( + self, + name: _Optional[str] = ..., + datatype: _Optional[str] = ..., + shape: _Optional[_Iterable[int]] = ..., + parameters: _Optional[_Mapping[str, InferParameter]] = ..., + contents: _Optional[_Union[InferTensorContents, _Mapping]] = ..., + ) -> None: ... + + class ParametersEntry(_message.Message): + __slots__ = ["key", "value"] + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: InferParameter + def __init__( + self, + key: _Optional[str] = ..., + value: _Optional[_Union[InferParameter, _Mapping]] = ..., + ) -> None: ... 
+ ID_FIELD_NUMBER: _ClassVar[int] + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + MODEL_VERSION_FIELD_NUMBER: _ClassVar[int] + OUTPUTS_FIELD_NUMBER: _ClassVar[int] + PARAMETERS_FIELD_NUMBER: _ClassVar[int] + RAW_OUTPUT_CONTENTS_FIELD_NUMBER: _ClassVar[int] + id: str + model_name: str + model_version: str + outputs: _containers.RepeatedCompositeFieldContainer[ + ModelInferResponse.InferOutputTensor + ] + parameters: _containers.MessageMap[str, InferParameter] + raw_output_contents: _containers.RepeatedScalarFieldContainer[bytes] + def __init__( + self, + model_name: _Optional[str] = ..., + model_version: _Optional[str] = ..., + id: _Optional[str] = ..., + parameters: _Optional[_Mapping[str, InferParameter]] = ..., + outputs: _Optional[ + _Iterable[_Union[ModelInferResponse.InferOutputTensor, _Mapping]] + ] = ..., + raw_output_contents: _Optional[_Iterable[bytes]] = ..., + ) -> None: ... + +class ModelMetadataRequest(_message.Message): + __slots__ = ["name", "version"] + NAME_FIELD_NUMBER: _ClassVar[int] + VERSION_FIELD_NUMBER: _ClassVar[int] + name: str + version: str + def __init__( + self, name: _Optional[str] = ..., version: _Optional[str] = ... + ) -> None: ... + +class ModelMetadataResponse(_message.Message): + __slots__ = ["inputs", "name", "outputs", "platform", "versions"] + + class TensorMetadata(_message.Message): + __slots__ = ["datatype", "name", "shape"] + DATATYPE_FIELD_NUMBER: _ClassVar[int] + NAME_FIELD_NUMBER: _ClassVar[int] + SHAPE_FIELD_NUMBER: _ClassVar[int] + datatype: str + name: str + shape: _containers.RepeatedScalarFieldContainer[int] + def __init__( + self, + name: _Optional[str] = ..., + datatype: _Optional[str] = ..., + shape: _Optional[_Iterable[int]] = ..., + ) -> None: ... + INPUTS_FIELD_NUMBER: _ClassVar[int] + NAME_FIELD_NUMBER: _ClassVar[int] + OUTPUTS_FIELD_NUMBER: _ClassVar[int] + PLATFORM_FIELD_NUMBER: _ClassVar[int] + VERSIONS_FIELD_NUMBER: _ClassVar[int] + inputs: _containers.RepeatedCompositeFieldContainer[ + ModelMetadataResponse.TensorMetadata + ] + name: str + outputs: _containers.RepeatedCompositeFieldContainer[ + ModelMetadataResponse.TensorMetadata + ] + platform: str + versions: _containers.RepeatedScalarFieldContainer[str] + def __init__( + self, + name: _Optional[str] = ..., + versions: _Optional[_Iterable[str]] = ..., + platform: _Optional[str] = ..., + inputs: _Optional[ + _Iterable[_Union[ModelMetadataResponse.TensorMetadata, _Mapping]] + ] = ..., + outputs: _Optional[ + _Iterable[_Union[ModelMetadataResponse.TensorMetadata, _Mapping]] + ] = ..., + ) -> None: ... + +class ModelReadyRequest(_message.Message): + __slots__ = ["name", "version"] + NAME_FIELD_NUMBER: _ClassVar[int] + VERSION_FIELD_NUMBER: _ClassVar[int] + name: str + version: str + def __init__( + self, name: _Optional[str] = ..., version: _Optional[str] = ... + ) -> None: ... + +class ModelReadyResponse(_message.Message): + __slots__ = ["ready"] + READY_FIELD_NUMBER: _ClassVar[int] + ready: bool + def __init__(self, ready: bool = ...) -> None: ... + +class RepositoryModelLoadRequest(_message.Message): + __slots__ = ["model_name"] + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + model_name: str + def __init__(self, model_name: _Optional[str] = ...) -> None: ... + +class RepositoryModelLoadResponse(_message.Message): + __slots__ = ["isLoaded", "model_name"] + ISLOADED_FIELD_NUMBER: _ClassVar[int] + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + isLoaded: bool + model_name: str + def __init__( + self, model_name: _Optional[str] = ..., isLoaded: bool = ... + ) -> None: ... 
+ +class RepositoryModelUnloadRequest(_message.Message): + __slots__ = ["model_name"] + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + model_name: str + def __init__(self, model_name: _Optional[str] = ...) -> None: ... + +class RepositoryModelUnloadResponse(_message.Message): + __slots__ = ["isUnloaded", "model_name"] + ISUNLOADED_FIELD_NUMBER: _ClassVar[int] + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + isUnloaded: bool + model_name: str + def __init__( + self, model_name: _Optional[str] = ..., isUnloaded: bool = ... + ) -> None: ... + +class ServerLiveRequest(_message.Message): + __slots__ = [] + def __init__(self) -> None: ... + +class ServerLiveResponse(_message.Message): + __slots__ = ["live"] + LIVE_FIELD_NUMBER: _ClassVar[int] + live: bool + def __init__(self, live: bool = ...) -> None: ... + +class ServerMetadataRequest(_message.Message): + __slots__ = [] + def __init__(self) -> None: ... + +class ServerMetadataResponse(_message.Message): + __slots__ = ["extensions", "name", "version"] + EXTENSIONS_FIELD_NUMBER: _ClassVar[int] + NAME_FIELD_NUMBER: _ClassVar[int] + VERSION_FIELD_NUMBER: _ClassVar[int] + extensions: _containers.RepeatedScalarFieldContainer[str] + name: str + version: str + def __init__( + self, + name: _Optional[str] = ..., + version: _Optional[str] = ..., + extensions: _Optional[_Iterable[str]] = ..., + ) -> None: ... + +class ServerReadyRequest(_message.Message): + __slots__ = [] + def __init__(self) -> None: ... + +class ServerReadyResponse(_message.Message): + __slots__ = ["ready"] + READY_FIELD_NUMBER: _ClassVar[int] + ready: bool + def __init__(self, ready: bool = ...) -> None: ... diff --git a/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2_grpc.py b/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2_grpc.py new file mode 100644 index 000000000..fc525ea05 --- /dev/null +++ b/python/hsml/client/istio/grpc/proto/grpc_predict_v2_pb2_grpc.py @@ -0,0 +1,419 @@ +# Copyright 2022 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import hsml.client.istio.grpc.inference_client as inference_client + +import hsml.client.istio.grpc.proto.grpc_predict_v2_pb2 as grpc__predict__v2__pb2 + + +class GRPCInferenceServiceStub(object): + """Inference Server GRPC endpoints.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.ServerLive = channel.unary_unary( + "/inference.GRPCInferenceService/ServerLive", + request_serializer=grpc__predict__v2__pb2.ServerLiveRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.ServerLiveResponse.FromString, + ) + self.ServerReady = channel.unary_unary( + "/inference.GRPCInferenceService/ServerReady", + request_serializer=grpc__predict__v2__pb2.ServerReadyRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.ServerReadyResponse.FromString, + ) + self.ModelReady = channel.unary_unary( + "/inference.GRPCInferenceService/ModelReady", + request_serializer=grpc__predict__v2__pb2.ModelReadyRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.ModelReadyResponse.FromString, + ) + self.ServerMetadata = channel.unary_unary( + "/inference.GRPCInferenceService/ServerMetadata", + request_serializer=grpc__predict__v2__pb2.ServerMetadataRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.ServerMetadataResponse.FromString, + ) + self.ModelMetadata = channel.unary_unary( + "/inference.GRPCInferenceService/ModelMetadata", + request_serializer=grpc__predict__v2__pb2.ModelMetadataRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.ModelMetadataResponse.FromString, + ) + self.ModelInfer = channel.unary_unary( + "/inference.GRPCInferenceService/ModelInfer", + request_serializer=grpc__predict__v2__pb2.ModelInferRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.ModelInferResponse.FromString, + ) + self.RepositoryModelLoad = channel.unary_unary( + "/inference.GRPCInferenceService/RepositoryModelLoad", + request_serializer=grpc__predict__v2__pb2.RepositoryModelLoadRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.RepositoryModelLoadResponse.FromString, + ) + self.RepositoryModelUnload = channel.unary_unary( + "/inference.GRPCInferenceService/RepositoryModelUnload", + request_serializer=grpc__predict__v2__pb2.RepositoryModelUnloadRequest.SerializeToString, + response_deserializer=grpc__predict__v2__pb2.RepositoryModelUnloadResponse.FromString, + ) + + +class GRPCInferenceServiceServicer(object): + """Inference Server GRPC endpoints.""" + + def ServerLive(self, request, context): + """The ServerLive API indicates if the inference server is able to receive + and respond to metadata and inference requests. + """ + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def ServerReady(self, request, context): + """The ServerReady API indicates if the server is ready for inferencing.""" + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def ModelReady(self, request, context): + """The ModelReady API indicates if a specific model is ready for inferencing.""" + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def ServerMetadata(self, request, context): + """The ServerMetadata API provides information about the server. Errors are + indicated by the google.rpc.Status returned for the request. The OK code + indicates success and other codes indicate failure. 
+ """ + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def ModelMetadata(self, request, context): + """The per-model metadata API provides information about a model. Errors are + indicated by the google.rpc.Status returned for the request. The OK code + indicates success and other codes indicate failure. + """ + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def ModelInfer(self, request, context): + """The ModelInfer API performs inference using the specified model. Errors are + indicated by the google.rpc.Status returned for the request. The OK code + indicates success and other codes indicate failure. + """ + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def RepositoryModelLoad(self, request, context): + """Load or reload a model from a repository.""" + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def RepositoryModelUnload(self, request, context): + """Unload a model.""" + context.set_code(inference_client.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_GRPCInferenceServiceServicer_to_server(servicer, server): + rpc_method_handlers = { + "ServerLive": inference_client.unary_unary_rpc_method_handler( + servicer.ServerLive, + request_deserializer=grpc__predict__v2__pb2.ServerLiveRequest.FromString, + response_serializer=grpc__predict__v2__pb2.ServerLiveResponse.SerializeToString, + ), + "ServerReady": inference_client.unary_unary_rpc_method_handler( + servicer.ServerReady, + request_deserializer=grpc__predict__v2__pb2.ServerReadyRequest.FromString, + response_serializer=grpc__predict__v2__pb2.ServerReadyResponse.SerializeToString, + ), + "ModelReady": inference_client.unary_unary_rpc_method_handler( + servicer.ModelReady, + request_deserializer=grpc__predict__v2__pb2.ModelReadyRequest.FromString, + response_serializer=grpc__predict__v2__pb2.ModelReadyResponse.SerializeToString, + ), + "ServerMetadata": inference_client.unary_unary_rpc_method_handler( + servicer.ServerMetadata, + request_deserializer=grpc__predict__v2__pb2.ServerMetadataRequest.FromString, + response_serializer=grpc__predict__v2__pb2.ServerMetadataResponse.SerializeToString, + ), + "ModelMetadata": inference_client.unary_unary_rpc_method_handler( + servicer.ModelMetadata, + request_deserializer=grpc__predict__v2__pb2.ModelMetadataRequest.FromString, + response_serializer=grpc__predict__v2__pb2.ModelMetadataResponse.SerializeToString, + ), + "ModelInfer": inference_client.unary_unary_rpc_method_handler( + servicer.ModelInfer, + request_deserializer=grpc__predict__v2__pb2.ModelInferRequest.FromString, + response_serializer=grpc__predict__v2__pb2.ModelInferResponse.SerializeToString, + ), + "RepositoryModelLoad": inference_client.unary_unary_rpc_method_handler( + servicer.RepositoryModelLoad, + request_deserializer=grpc__predict__v2__pb2.RepositoryModelLoadRequest.FromString, + response_serializer=grpc__predict__v2__pb2.RepositoryModelLoadResponse.SerializeToString, + ), + "RepositoryModelUnload": inference_client.unary_unary_rpc_method_handler( + 
servicer.RepositoryModelUnload, + request_deserializer=grpc__predict__v2__pb2.RepositoryModelUnloadRequest.FromString, + response_serializer=grpc__predict__v2__pb2.RepositoryModelUnloadResponse.SerializeToString, + ), + } + generic_handler = inference_client.method_handlers_generic_handler( + "inference.GRPCInferenceService", rpc_method_handlers + ) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. +class GRPCInferenceService(object): + """Inference Server GRPC endpoints.""" + + @staticmethod + def ServerLive( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/ServerLive", + grpc__predict__v2__pb2.ServerLiveRequest.SerializeToString, + grpc__predict__v2__pb2.ServerLiveResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def ServerReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/ServerReady", + grpc__predict__v2__pb2.ServerReadyRequest.SerializeToString, + grpc__predict__v2__pb2.ServerReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def ModelReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/ModelReady", + grpc__predict__v2__pb2.ModelReadyRequest.SerializeToString, + grpc__predict__v2__pb2.ModelReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def ServerMetadata( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/ServerMetadata", + grpc__predict__v2__pb2.ServerMetadataRequest.SerializeToString, + grpc__predict__v2__pb2.ServerMetadataResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def ModelMetadata( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/ModelMetadata", + grpc__predict__v2__pb2.ModelMetadataRequest.SerializeToString, + grpc__predict__v2__pb2.ModelMetadataResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def ModelInfer( + request, + 
target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/ModelInfer", + grpc__predict__v2__pb2.ModelInferRequest.SerializeToString, + grpc__predict__v2__pb2.ModelInferResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def RepositoryModelLoad( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/RepositoryModelLoad", + grpc__predict__v2__pb2.RepositoryModelLoadRequest.SerializeToString, + grpc__predict__v2__pb2.RepositoryModelLoadResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def RepositoryModelUnload( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return inference_client.experimental.unary_unary( + request, + target, + "/inference.GRPCInferenceService/RepositoryModelUnload", + grpc__predict__v2__pb2.RepositoryModelUnloadRequest.SerializeToString, + grpc__predict__v2__pb2.RepositoryModelUnloadResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/python/hsml/client/istio/internal.py b/python/hsml/client/istio/internal.py index 1033f941e..feab49d71 100644 --- a/python/hsml/client/istio/internal.py +++ b/python/hsml/client/istio/internal.py @@ -21,7 +21,7 @@ from pathlib import Path -from hsml.client import auth +from hsml.client import auth, exceptions from hsml.client.istio import base as istio try: @@ -198,3 +198,9 @@ def _get_cert_pw(self): with pwd_path.open() as f: return f.read() + + def _get_serving_api_key(self): + """Retrieve serving API key from environment variable.""" + if self.SERVING_API_KEY not in os.environ: + raise exceptions.InternalClientError("Serving API key not found") + return os.environ[self.SERVING_API_KEY] diff --git a/python/hsml/client/istio/utils/__init__.py b/python/hsml/client/istio/utils/__init__.py new file mode 100644 index 000000000..ff8055b9b --- /dev/null +++ b/python/hsml/client/istio/utils/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/python/hsml/client/istio/utils/infer_type.py b/python/hsml/client/istio/utils/infer_type.py new file mode 100644 index 000000000..7feabc20e --- /dev/null +++ b/python/hsml/client/istio/utils/infer_type.py @@ -0,0 +1,811 @@ +# Copyright 2023 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This implementation has been borrowed from kserve/kserve repository +# https://github.com/kserve/kserve/blob/release-0.11/python/kserve/kserve/protocol/infer_type.py + +from typing import Optional, List, Dict + +import struct +import numpy +import numpy as np +import pandas as pd +from hsml.client.istio.grpc.errors import InvalidInput +from hsml.client.istio.grpc.proto.grpc_predict_v2_pb2 import ( + ModelInferRequest, + InferTensorContents, + ModelInferResponse, +) +from hsml.client.istio.utils.numpy_codec import to_np_dtype, from_np_dtype + +GRPC_CONTENT_DATATYPE_MAPPINGS = { + "BOOL": "bool_contents", + "INT8": "int_contents", + "INT16": "int_contents", + "INT32": "int_contents", + "INT64": "int64_contents", + "UINT8": "uint_contents", + "UINT16": "uint_contents", + "UINT32": "uint_contents", + "UINT64": "uint64_contents", + "FP32": "fp32_contents", + "FP64": "fp64_contents", + "BYTES": "bytes_contents", +} + + +def raise_error(msg): + """ + Raise error with the provided message + """ + raise InferenceServerException(msg=msg) from None + + +def serialize_byte_tensor(input_tensor): + """ + Serializes a bytes tensor into a flat numpy array of length prepended + bytes. The numpy array should use dtype of np.object. For np.bytes, + numpy will remove trailing zeros at the end of byte sequence and because + of this it should be avoided. + + Parameters + ---------- + input_tensor : np.array + The bytes tensor to serialize. + + Returns + ------- + serialized_bytes_tensor : np.array + The 1-D numpy array of type uint8 containing the serialized bytes in row-major form. + + Raises + ------ + InferenceServerException + If unable to serialize the given tensor. + """ + + if input_tensor.size == 0: + return np.empty([0], dtype=np.object_) + + # If the input is a tensor of string/bytes objects, then must flatten those into + # a 1-dimensional array containing the 4-byte byte size followed by the + # actual element bytes. All elements are concatenated together in row-major + # order. + + if (input_tensor.dtype != np.object_) and (input_tensor.dtype.type != np.bytes_): + raise_error("cannot serialize bytes tensor: invalid datatype") + + flattened_ls = [] + # 'C' order is row-major. 
+ for obj in np.nditer(input_tensor, flags=["refs_ok"], order="C"): + # If directly passing bytes to BYTES type, + # don't convert it to str as Python will encode the + # bytes which may distort the meaning + if input_tensor.dtype == np.object_: + if type(obj.item()) == bytes: + s = obj.item() + else: + s = str(obj.item()).encode("utf-8") + else: + s = obj.item() + flattened_ls.append(struct.pack(" np.ndarray: + dtype = to_np_dtype(self.datatype) + if dtype is None: + raise InvalidInput("invalid datatype in the input") + if self._raw_data is not None: + np_array = np.frombuffer(self._raw_data, dtype=dtype) + return np_array.reshape(self._shape) + else: + np_array = np.array(self._data, dtype=dtype) + return np_array.reshape(self._shape) + + def set_data_from_numpy(self, input_tensor, binary_data=True): + """Set the tensor data from the specified numpy array for + input associated with this object. + Parameters + ---------- + input_tensor : numpy array + The tensor data in numpy array format + binary_data : bool + Indicates whether to set data for the input in binary format + or explicit tensor within JSON. The default value is True, + which means the data will be delivered as binary data in the + HTTP body after the JSON object. + Raises + ------ + InferenceServerException + If failed to set data for the tensor. + """ + if not isinstance(input_tensor, (np.ndarray,)): + raise_error("input_tensor must be a numpy array") + + dtype = from_np_dtype(input_tensor.dtype) + if self._datatype != dtype: + raise_error( + "got unexpected datatype {} from numpy array, expected {}".format( + dtype, self._datatype + ) + ) + valid_shape = True + if len(self._shape) != len(input_tensor.shape): + valid_shape = False + else: + for i in range(len(self._shape)): + if self._shape[i] != input_tensor.shape[i]: + valid_shape = False + if not valid_shape: + raise_error( + "got unexpected numpy array shape [{}], expected [{}]".format( + str(input_tensor.shape)[1:-1], str(self._shape)[1:-1] + ) + ) + + if not binary_data: + self._parameters.pop("binary_data_size", None) + self._raw_data = None + if self._datatype == "BYTES": + self._data = [] + try: + if input_tensor.size > 0: + for obj in np.nditer( + input_tensor, flags=["refs_ok"], order="C" + ): + # We need to convert the object to string using utf-8, + # if we want to use the binary_data=False. JSON requires + # the input to be a UTF-8 string. + if input_tensor.dtype == np.object_: + if type(obj.item()) == bytes: + self._data.append(str(obj.item(), encoding="utf-8")) + else: + self._data.append(str(obj.item())) + else: + self._data.append(str(obj.item(), encoding="utf-8")) + except UnicodeDecodeError: + raise_error( + f'Failed to encode "{obj.item()}" using UTF-8. Please use binary_data=True, if' + " you want to pass a byte array." 
+ ) + else: + self._data = [val.item() for val in input_tensor.flatten()] + else: + self._data = None + if self._datatype == "BYTES": + serialized_output = serialize_byte_tensor(input_tensor) + if serialized_output.size > 0: + self._raw_data = serialized_output.item() + else: + self._raw_data = b"" + else: + self._raw_data = input_tensor.tobytes() + self._parameters["binary_data_size"] = len(self._raw_data) + + +def get_content(datatype: str, data: InferTensorContents): + if datatype == "BOOL": + return list(data.bool_contents) + elif datatype in ["UINT8", "UINT16", "UINT32"]: + return list(data.uint_contents) + elif datatype == "UINT64": + return list(data.uint64_contents) + elif datatype in ["INT8", "INT16", "INT32"]: + return list(data.int_contents) + elif datatype == "INT64": + return list(data.int64_contents) + elif datatype == "FP32": + return list(data.fp32_contents) + elif datatype == "FP64": + return list(data.fp64_contents) + elif datatype == "BYTES": + return list(data.bytes_contents) + else: + raise InvalidInput("invalid content type") + + +class InferRequest: + """InferenceRequest Model + + $inference_request = + { + "id" : $string #optional, + "parameters" : $parameters #optional, + "inputs" : [ $request_input, ... ], + "outputs" : [ $request_output, ... ] #optional + } + """ + + id: Optional[str] + model_name: str + parameters: Optional[Dict] + inputs: List[InferInput] + from_grpc: bool + + def __init__( + self, + model_name: str, + infer_inputs: List[InferInput], + request_id=None, + raw_inputs=None, + from_grpc=False, + parameters=None, + ): + if parameters is None: + parameters = {} + self.id = request_id + self.model_name = model_name + self.inputs = infer_inputs + self.parameters = parameters + self.from_grpc = from_grpc + if raw_inputs: + for i, raw_input in enumerate(raw_inputs): + self.inputs[i]._raw_data = raw_input + + @classmethod + def from_grpc(cls, request: ModelInferRequest): + infer_inputs = [ + InferInput( + name=input_tensor.name, + shape=list(input_tensor.shape), + datatype=input_tensor.datatype, + data=get_content(input_tensor.datatype, input_tensor.contents), + parameters=input_tensor.parameters, + ) + for input_tensor in request.inputs + ] + return cls( + request_id=request.id, + model_name=request.model_name, + infer_inputs=infer_inputs, + raw_inputs=request.raw_input_contents, + from_grpc=True, + parameters=request.parameters, + ) + + def to_rest(self) -> Dict: + """Converts the InferRequest object to v2 REST InferenceRequest message""" + infer_inputs = [] + for infer_input in self.inputs: + infer_input_dict = { + "name": infer_input.name, + "shape": infer_input.shape, + "datatype": infer_input.datatype, + } + if isinstance(infer_input.data, numpy.ndarray): + infer_input.set_data_from_numpy(infer_input.data, binary_data=False) + infer_input_dict["data"] = infer_input.data + else: + infer_input_dict["data"] = infer_input.data + infer_inputs.append(infer_input_dict) + return {"id": self.id, "inputs": infer_inputs} + + def to_grpc(self) -> ModelInferRequest: + """Converts the InferRequest object to gRPC ModelInferRequest message""" + infer_inputs = [] + raw_input_contents = [] + for infer_input in self.inputs: + if isinstance(infer_input.data, numpy.ndarray): + infer_input.set_data_from_numpy(infer_input.data, binary_data=True) + infer_input_dict = { + "name": infer_input.name, + "shape": infer_input.shape, + "datatype": infer_input.datatype, + } + if infer_input._raw_data is not None: + raw_input_contents.append(infer_input._raw_data) + else: + if not 
isinstance(infer_input.data, List): + raise InvalidInput("input data is not a List") + infer_input_dict["contents"] = {} + data_key = GRPC_CONTENT_DATATYPE_MAPPINGS.get( + infer_input.datatype, None + ) + if data_key is not None: + infer_input._data = [ + bytes(val, "utf-8") if isinstance(val, str) else val + for val in infer_input.data + ] # str to byte conversion for grpc proto + infer_input_dict["contents"][data_key] = infer_input.data + else: + raise InvalidInput("invalid input datatype") + infer_inputs.append(infer_input_dict) + + return ModelInferRequest( + id=self.id, + model_name=self.model_name, + inputs=infer_inputs, + raw_input_contents=raw_input_contents, + ) + + def as_dataframe(self) -> pd.DataFrame: + """ + Decode the tensor inputs as pandas dataframe + """ + dfs = [] + for input in self.inputs: + input_data = input.data + if input.datatype == "BYTES": + input_data = [ + str(val, "utf-8") if isinstance(val, bytes) else val + for val in input.data + ] + dfs.append(pd.DataFrame(input_data, columns=[input.name])) + return pd.concat(dfs, axis=1) + + +class InferOutput: + def __init__(self, name, shape, datatype, data=None, parameters=None): + """An object of InferOutput class is used to describe + input tensor for an inference request. + Parameters + ---------- + name : str + The name of input whose data will be described by this object + shape : list + The shape of the associated input. + datatype : str + The datatype of the associated input. + data : Union[List, InferTensorContents] + The data of the REST/gRPC input. When data is not set, raw_data is used for gRPC for numpy array bytes. + parameters : dict + The additional server-specific parameters. + """ + if parameters is None: + parameters = {} + self._name = name + self._shape = shape + self._datatype = datatype + self._parameters = parameters + self._data = data + self._raw_data = None + + @property + def name(self): + """Get the name of input associated with this object. + Returns + ------- + str + The name of input + """ + return self._name + + @property + def datatype(self): + """Get the datatype of input associated with this object. + Returns + ------- + str + The datatype of input + """ + return self._datatype + + @property + def data(self): + """Get the data of InferOutput""" + return self._data + + @property + def shape(self): + """Get the shape of input associated with this object. + Returns + ------- + list + The shape of input + """ + return self._shape + + @property + def parameters(self): + """Get the parameters of input associated with this object. + Returns + ------- + dict + The key, value pair of string and InferParameter + """ + return self._parameters + + def set_shape(self, shape): + """Set the shape of input. + Parameters + ---------- + shape : list + The shape of the associated input. + """ + self._shape = shape + + def as_numpy(self) -> numpy.ndarray: + """ + Decode the tensor data as numpy array + """ + dtype = to_np_dtype(self.datatype) + if dtype is None: + raise InvalidInput("invalid datatype in the input") + if self._raw_data is not None: + np_array = np.frombuffer(self._raw_data, dtype=dtype) + return np_array.reshape(self._shape) + else: + np_array = np.array(self._data, dtype=dtype) + return np_array.reshape(self._shape) + + def set_data_from_numpy(self, input_tensor, binary_data=True): + """Set the tensor data from the specified numpy array for + input associated with this object. 
+ Parameters + ---------- + input_tensor : numpy array + The tensor data in numpy array format + binary_data : bool + Indicates whether to set data for the input in binary format + or explicit tensor within JSON. The default value is True, + which means the data will be delivered as binary data in the + HTTP body after the JSON object. + Raises + ------ + InferenceServerException + If failed to set data for the tensor. + """ + if not isinstance(input_tensor, (np.ndarray,)): + raise_error("input_tensor must be a numpy array") + + dtype = from_np_dtype(input_tensor.dtype) + if self._datatype != dtype: + raise_error( + "got unexpected datatype {} from numpy array, expected {}".format( + dtype, self._datatype + ) + ) + valid_shape = True + if len(self._shape) != len(input_tensor.shape): + valid_shape = False + else: + for i in range(len(self._shape)): + if self._shape[i] != input_tensor.shape[i]: + valid_shape = False + if not valid_shape: + raise_error( + "got unexpected numpy array shape [{}], expected [{}]".format( + str(input_tensor.shape)[1:-1], str(self._shape)[1:-1] + ) + ) + + if not binary_data: + self._parameters.pop("binary_data_size", None) + self._raw_data = None + if self._datatype == "BYTES": + self._data = [] + try: + if input_tensor.size > 0: + for obj in np.nditer( + input_tensor, flags=["refs_ok"], order="C" + ): + # We need to convert the object to string using utf-8, + # if we want to use the binary_data=False. JSON requires + # the input to be a UTF-8 string. + if input_tensor.dtype == np.object_: + if type(obj.item()) == bytes: + self._data.append(str(obj.item(), encoding="utf-8")) + else: + self._data.append(str(obj.item())) + else: + self._data.append(str(obj.item(), encoding="utf-8")) + except UnicodeDecodeError: + raise_error( + f'Failed to encode "{obj.item()}" using UTF-8. Please use binary_data=True, if' + " you want to pass a byte array." + ) + else: + self._data = [val.item() for val in input_tensor.flatten()] + else: + self._data = None + if self._datatype == "BYTES": + serialized_output = serialize_byte_tensor(input_tensor) + if serialized_output.size > 0: + self._raw_data = serialized_output.item() + else: + self._raw_data = b"" + else: + self._raw_data = input_tensor.tobytes() + self._parameters["binary_data_size"] = len(self._raw_data) + + +class InferResponse: + """InferenceResponse + + $inference_response = + { + "model_name" : $string, + "model_version" : $string #optional, + "id" : $string, + "parameters" : $parameters #optional, + "outputs" : [ $response_output, ... 
] + } + """ + + id: str + model_name: str + parameters: Optional[Dict] + outputs: List[InferOutput] + from_grpc: bool + + def __init__( + self, + response_id: str, + model_name: str, + infer_outputs: List[InferOutput], + raw_outputs=None, + from_grpc=False, + parameters=None, + ): + if parameters is None: + parameters = {} + self.id = response_id + self.model_name = model_name + self.outputs = infer_outputs + self.parameters = parameters + self.from_grpc = from_grpc + if raw_outputs: + for i, raw_output in enumerate(raw_outputs): + self.outputs[i]._raw_data = raw_output + + @classmethod + def from_grpc(cls, response: ModelInferResponse) -> "InferResponse": + infer_outputs = [ + InferOutput( + name=output.name, + shape=list(output.shape), + datatype=output.datatype, + data=get_content(output.datatype, output.contents), + parameters=output.parameters, + ) + for output in response.outputs + ] + return cls( + model_name=response.model_name, + response_id=response.id, + parameters=response.parameters, + infer_outputs=infer_outputs, + raw_outputs=response.raw_output_contents, + from_grpc=True, + ) + + @classmethod + def from_rest(cls, model_name: str, response: Dict) -> "InferResponse": + infer_outputs = [ + InferOutput( + name=output["name"], + shape=list(output["shape"]), + datatype=output["datatype"], + data=output["data"], + parameters=output.get("parameters", {}), + ) + for output in response["outputs"] + ] + return cls( + model_name=model_name, + response_id=response.get("id", None), + parameters=response.get("parameters", {}), + infer_outputs=infer_outputs, + ) + + def to_rest(self) -> Dict: + """Converts the InferResponse object to v2 REST InferenceRequest message""" + infer_outputs = [] + for i, infer_output in enumerate(self.outputs): + infer_output_dict = { + "name": infer_output.name, + "shape": infer_output.shape, + "datatype": infer_output.datatype, + } + if isinstance(infer_output.data, numpy.ndarray): + infer_output.set_data_from_numpy(infer_output.data, binary_data=False) + infer_output_dict["data"] = infer_output.data + elif isinstance(infer_output._raw_data, bytes): + infer_output_dict["data"] = infer_output.as_numpy().tolist() + else: + infer_output_dict["data"] = infer_output.data + infer_outputs.append(infer_output_dict) + res = {"id": self.id, "model_name": self.model_name, "outputs": infer_outputs} + return res + + def to_grpc(self) -> ModelInferResponse: + """Converts the InferResponse object to gRPC ModelInferRequest message""" + infer_outputs = [] + raw_output_contents = [] + for infer_output in self.outputs: + if isinstance(infer_output.data, numpy.ndarray): + infer_output.set_data_from_numpy(infer_output.data, binary_data=True) + infer_output_dict = { + "name": infer_output.name, + "shape": infer_output.shape, + "datatype": infer_output.datatype, + } + if infer_output._raw_data is not None: + raw_output_contents.append(infer_output._raw_data) + else: + if not isinstance(infer_output.data, List): + raise InvalidInput("output data is not a List") + infer_output_dict["contents"] = {} + data_key = GRPC_CONTENT_DATATYPE_MAPPINGS.get( + infer_output.datatype, None + ) + if data_key is not None: + infer_output._data = [ + bytes(val, "utf-8") if isinstance(val, str) else val + for val in infer_output.data + ] # str to byte conversion for grpc proto + infer_output_dict["contents"][data_key] = infer_output.data + else: + raise InvalidInput("to_grpc: invalid output datatype") + infer_outputs.append(infer_output_dict) + + return ModelInferResponse( + id=self.id, + 
model_name=self.model_name, + outputs=infer_outputs, + raw_output_contents=raw_output_contents, + ) diff --git a/python/hsml/client/istio/utils/numpy_codec.py b/python/hsml/client/istio/utils/numpy_codec.py new file mode 100644 index 000000000..3c6ecb606 --- /dev/null +++ b/python/hsml/client/istio/utils/numpy_codec.py @@ -0,0 +1,67 @@ +# Copyright 2021 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This implementation has been borrowed from kserve/kserve repository +# https://github.com/kserve/kserve/blob/release-0.11/python/kserve/kserve/utils/numpy_codec.py + +import numpy as np + + +def to_np_dtype(dtype): + dtype_map = { + "BOOL": bool, + "INT8": np.int8, + "INT16": np.int16, + "INT32": np.int32, + "INT64": np.int64, + "UINT8": np.uint8, + "UINT16": np.uint16, + "UINT32": np.uint32, + "UINT64": np.uint64, + "FP16": np.float16, + "FP32": np.float32, + "FP64": np.float64, + "BYTES": np.object_, + } + return dtype_map.get(dtype, None) + + +def from_np_dtype(np_dtype): + if np_dtype == bool: + return "BOOL" + elif np_dtype == np.int8: + return "INT8" + elif np_dtype == np.int16: + return "INT16" + elif np_dtype == np.int32: + return "INT32" + elif np_dtype == np.int64: + return "INT64" + elif np_dtype == np.uint8: + return "UINT8" + elif np_dtype == np.uint16: + return "UINT16" + elif np_dtype == np.uint32: + return "UINT32" + elif np_dtype == np.uint64: + return "UINT64" + elif np_dtype == np.float16: + return "FP16" + elif np_dtype == np.float32: + return "FP32" + elif np_dtype == np.float64: + return "FP64" + elif np_dtype == np.object_ or np_dtype.type == np.bytes_: + return "BYTES" + return None diff --git a/python/hsml/constants.py b/python/hsml/constants.py index 682c7884b..42dd7a412 100644 --- a/python/hsml/constants.py +++ b/python/hsml/constants.py @@ -108,6 +108,9 @@ class INFERENCE_ENDPOINTS: PORT_NAME_HTTPS = "HTTPS" PORT_NAME_STATUS_PORT = "STATUS" PORT_NAME_TLS = "TLS" + # protocol + API_PROTOCOL_REST = "REST" + API_PROTOCOL_GRPC = "GRPC" class DEPLOYABLE_COMPONENT: diff --git a/python/hsml/core/serving_api.py b/python/hsml/core/serving_api.py index 9d2bb8cd0..49b1cabb4 100644 --- a/python/hsml/core/serving_api.py +++ b/python/hsml/core/serving_api.py @@ -15,11 +15,17 @@ # import json +from typing import Union, Dict, List from hsml import client, deployment, predictor_state from hsml import inference_endpoint from hsml import deployable_component_logs -from hsml.constants import ARTIFACT_VERSION +from hsml.constants import ARTIFACT_VERSION, INFERENCE_ENDPOINTS as IE +from hsml.client.istio.utils.infer_type import ( + InferRequest, + InferInput, + InferOutput, +) class ServingApi: @@ -189,21 +195,37 @@ def reset_changes(self, deployment_instance): def send_inference_request( self, deployment_instance, - data: dict, + data: Union[Dict, List[InferInput]], through_hopsworks: bool = False, - ): + ) -> Union[Dict, List[InferOutput]]: """Send inference requests to a deployment with a certain id :param deployment_instance: metadata object of the deployment to 
be used for the prediction :type deployment_instance: Deployment - :param data: payload of the inference requests - :type data: dict - :param through_hopsworks: whether to send the inference request through the Hopsworks REST API + :param data: payload of the inference request + :type data: Union[Dict, List[InferInput]] + :param through_hopsworks: whether to send the inference request through the Hopsworks REST API or not :type through_hopsworks: bool :return: inference response - :rtype: dict + :rtype: Union[Dict, List[InferOutput]] """ + if deployment_instance.api_protocol == IE.API_PROTOCOL_REST: + # REST protocol, use hopsworks or istio client + return self._send_inference_request_via_rest_protocol( + deployment_instance, data, through_hopsworks + ) + else: + # gRPC protocol, use the deployment grpc channel + return self._send_inference_request_via_grpc_protocol( + deployment_instance, data + ) + def _send_inference_request_via_rest_protocol( + self, + deployment_instance, + data: Dict, + through_hopsworks: bool = False, + ) -> Dict: headers = {"content-type": "application/json"} if through_hopsworks: # use Hopsworks client @@ -228,10 +250,47 @@ def send_inference_request( path_params = self._get_hopsworks_inference_path( _client._project_id, deployment_instance ) + + # send inference request return _client._send_request( "POST", path_params, headers=headers, data=json.dumps(data) ) + def _send_inference_request_via_grpc_protocol( + self, deployment_instance, data: List[InferInput] + ) -> List[InferOutput]: + # get grpc channel + if deployment_instance._grpc_channel is None: + # The gRPC channel is lazily initialized. The first call to deployment.predict() will initialize + # the channel, which will be reused in all following calls on the same deployment object. + # The gRPC channel is freed when calling deployment.stop() + print("Initializing gRPC channel...") + deployment_instance._grpc_channel = self._create_grpc_channel( + deployment_instance.name + ) + # build an infer request + request = InferRequest( + infer_inputs=data, + model_name=deployment_instance.name, + ) + + # send infer request + infer_response = deployment_instance._grpc_channel.infer( + infer_request=request, headers=None + ) + + # extract infer outputs + return infer_response.outputs + + def _create_grpc_channel(self, deployment_name: str): + _client = client.get_istio_instance() + service_hostname = self._get_inference_request_host_header( + _client._project_name, + deployment_name, + client.get_knative_domain(), + ) + return _client._create_grpc_channel(service_hostname) + def is_kserve_installed(self): """Check if kserve is installed diff --git a/python/hsml/deployment.py b/python/hsml/deployment.py index eb68300ea..94d9f87b4 100644 --- a/python/hsml/deployment.py +++ b/python/hsml/deployment.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Union, Optional +from typing import Union, Dict, Optional, List from hsml import client, util from hsml import predictor as predictor_mod @@ -28,6 +28,7 @@ from hsml.client.exceptions import ModelServingException from hsml.constants import DEPLOYABLE_COMPONENT, PREDICTOR_STATE +from hsml.client.istio.utils.infer_type import InferInput class Deployment: @@ -60,6 +61,7 @@ def __init__( self._serving_api = serving_api.ServingApi() self._serving_engine = serving_engine.ServingEngine() + self._grpc_channel = None def save(self, await_update: Optional[int] = 60): """Persist this deployment including the predictor and metadata to Model Serving. @@ -163,7 +165,11 @@ def is_stopped(self, or_created=True) -> bool: ) ) - def predict(self, data: dict = None, inputs: list = None): + def predict( + self, + data: Union[Dict, InferInput] = None, + inputs: Union[List, Dict] = None, + ): """Send inference requests to the deployment. One of data or inputs parameters must be set. If both are set, inputs will be ignored. @@ -432,6 +438,14 @@ def creator(self): """Creator of the predictor.""" return self._predictor.creator + @property + def api_protocol(self): + return self._predictor.api_protocol + + @api_protocol.setter + def api_protocol(self, api_protocol: str): + self._predictor.api_protocol = api_protocol + def __repr__(self): desc = ( f", description: {self._description!r}" diff --git a/python/hsml/engine/serving_engine.py b/python/hsml/engine/serving_engine.py index f0d508bf5..8341bcec8 100644 --- a/python/hsml/engine/serving_engine.py +++ b/python/hsml/engine/serving_engine.py @@ -14,6 +14,8 @@ # limitations under the License. # +from typing import Union, Dict, List + import os import time import uuid @@ -22,10 +24,17 @@ from hsml import util -from hsml.constants import DEPLOYMENT, PREDICTOR, PREDICTOR_STATE +from hsml.constants import ( + DEPLOYMENT, + PREDICTOR, + PREDICTOR_STATE, + INFERENCE_ENDPOINTS as IE, +) + from hsml.core import serving_api, dataset_api from hsml.client.exceptions import ModelServingException, RestAPIError +from hsml.client.istio.utils.infer_type import InferInput class ServingEngine: @@ -174,58 +183,8 @@ def update_progress(state, num_instances): update_progress, ) - def predict(self, deployment_instance, data, inputs): - payload = self._build_inference_payload(data, inputs) - - serving_tool = deployment_instance.predictor.serving_tool - through_hopsworks = ( - serving_tool != PREDICTOR.SERVING_TOOL_KSERVE - ) # if not KServe, send request to Hopsworks - try: - return self._serving_api.send_inference_request( - deployment_instance, payload, through_hopsworks - ) - except RestAPIError as re: - if ( - re.response.status_code == RestAPIError.STATUS_CODE_NOT_FOUND - or re.error_code - == ModelServingException.ERROR_CODE_DEPLOYMENT_NOT_RUNNING - ): - raise ModelServingException( - "Deployment not created or running. If it is already created, start it by using `.start()` or check its status with .get_state()" - ) - - re.args = ( - re.args[0] + "\n\n Check the model server logs by using `.get_logs()`", - ) - raise re - - def _build_inference_payload(self, data, inputs): - """Build or check the payload for an inference request. If the 'data' parameter is provided, this method ensures - it contains one of 'instances' or 'inputs' keys needed by the model server. Otherwise, if the 'inputs' parameter - is provided, this method builds the correct request payload using the 'instances' key. 
- While the 'inputs' key is only supported by default deployments, the 'instances' key is supported in all types of deployments. - """ - if data is not None: # check data - if not isinstance(data, dict): - raise ModelServingException( - "Inference data must be a dictionary. Otherwise, use the inputs parameter." - ) - if "instances" not in data and "inputs" not in data: - raise ModelServingException("Inference data is missing 'instances' key") - else: # parse inputs - if not isinstance(inputs, list): - data = {"instances": [inputs]} # wrap inputs in a list - else: - data = {"instances": inputs} # use given inputs list by default - # check depth of the list: at least two levels are required for batch inference - # if the content is neither a list or dict, wrap it in an additional list - for i in inputs: - if not isinstance(i, list) and not isinstance(i, dict): - # if there are no two levels, wrap inputs in a list - data = {"instances": [inputs]} - break - return data + # free grpc channel + deployment_instance._grpc_channel = None def _check_status(self, deployment_instance, desired_status): state = deployment_instance.get_state() @@ -490,7 +449,11 @@ def delete(self, deployment_instance, force=False): if state is None: return - if not force and state.status != PREDICTOR_STATE.STATUS_STOPPED: + if ( + not force + and state.status != PREDICTOR_STATE.STATUS_STOPPED + and state.status != PREDICTOR_STATE.STATUS_CREATED + ): raise ModelServingException( "Deployment not stopped, please stop it first by using `.stop()` or check its status with .get_state()" ) @@ -535,3 +498,208 @@ def get_logs(self, deployment_instance, component, tail): ) return self._serving_api.get_logs(deployment_instance, component, tail) + + # Model inference + + def predict( + self, + deployment_instance, + data: Union[Dict, List[InferInput]], + inputs: Union[Dict, List[Dict]], + ): + # validate user-provided payload + self._validate_inference_payload(deployment_instance.api_protocol, data, inputs) + + # build inference payload based on API protocol + payload = self._build_inference_payload( + deployment_instance.api_protocol, data, inputs + ) + + # if not KServe, send request through Hopsworks + serving_tool = deployment_instance.predictor.serving_tool + through_hopsworks = serving_tool != PREDICTOR.SERVING_TOOL_KSERVE + try: + return self._serving_api.send_inference_request( + deployment_instance, payload, through_hopsworks + ) + except RestAPIError as re: + if ( + re.response.status_code == RestAPIError.STATUS_CODE_NOT_FOUND + or re.error_code + == ModelServingException.ERROR_CODE_DEPLOYMENT_NOT_RUNNING + ): + raise ModelServingException( + "Deployment not created or running. If it is already created, start it by using `.start()` or check its status with .get_state()" + ) + + re.args = ( + re.args[0] + "\n\n Check the model server logs by using `.get_logs()`", + ) + raise re + + def _validate_inference_payload( + self, + api_protocol, + data: Union[Dict, List[InferInput]], + inputs: Union[Dict, List[Dict]], + ): + """Validates the user-provided inference payload. Either data or inputs parameter is expected, but both cannot be provided together.""" + # check null inputs + if data is not None and inputs is not None: + raise ModelServingException( + "Inference data and inputs parameters cannot be provided together." 
+ ) + # check data or inputs + if data is not None: + self._validate_inference_data(api_protocol, data) + else: + self._validate_inference_inputs(api_protocol, inputs) + + def _validate_inference_data( + self, api_protocol, data: Union[Dict, List[InferInput]] + ): + """Validates the inference payload when provided through the `data` parameter. The data parameter contains the raw payload to be sent + in the inference request and should have the corresponding type and format depending on the API protocol. + For the REST protocol, data should be a dictionary. For GRPC protocol, one or more InferInput objects is expected. + """ + if api_protocol == IE.API_PROTOCOL_REST: # REST protocol + if isinstance(data, Dict): + if "instances" not in data and "inputs" not in data: + raise ModelServingException( + "Inference data is missing 'instances' key." + ) + + payload = data["instances"] if "instances" in data else data["inputs"] + if not isinstance(payload, List): + raise ModelServingException( + "Instances field should contain a 2-dim list." + ) + elif len(payload) == 0: + raise ModelServingException( + "Inference data cannot contain an empty list." + ) + elif not isinstance(payload[0], List): + raise ModelServingException( + "Instances field should contain a 2-dim list." + ) + elif len(payload[0]) == 0: + raise ModelServingException( + "Inference data cannot contain an empty list." + ) + else: # not Dict + if isinstance(data, InferInput) or ( + isinstance(data, List) and isinstance(data[0], InferInput) + ): + raise ModelServingException( + "Inference data cannot contain `InferInput` for deployments with gRPC protocol disabled. Use a dictionary instead." + ) + raise ModelServingException( + "Inference data must be a dictionary. Otherwise, use the `inputs` parameter." + ) + + else: # gRPC protocol + if isinstance(data, Dict): + raise ModelServingException( + "Inference data cannot be a dictionary for deployments with gRPC protocol enabled. " + "Create a `InferInput` object or use the `inputs` parameter instead." + ) + elif isinstance(data, List): + if len(data) == 0: + raise ModelServingException( + "Inference data cannot contain an empty list." + ) + if not isinstance(data[0], InferInput): + raise ModelServingException( + "Inference data must contain a list of `InferInput` objects. Otherwise, use the `inputs` parameter." + ) + else: + raise ModelServingException( + "Inference data must contain a list of `InferInput` objects for deployments with gRPC protocol enabled." + ) + + def _validate_inference_inputs( + self, api_protocol, inputs: Union[Dict, List[Dict]], recursive_call=False + ): + """Validates the inference payload when provided through the `inputs` parameter. The inputs parameter contains only the payload values, + which will be parsed when building the request payload. It can be either a dictionary or a list. + """ + if isinstance(inputs, List): + if len(inputs) == 0: + raise ModelServingException("Inference inputs cannot be an empty list.") + else: + self._validate_inference_inputs( + api_protocol, inputs[0], recursive_call=True + ) + elif isinstance(inputs, InferInput): + raise ModelServingException( + "Inference inputs cannot be of type `InferInput`. Use the `data` parameter instead." + ) + elif isinstance(inputs, Dict): + required_keys = ("name", "shape", "datatype", "data") + if api_protocol == IE.API_PROTOCOL_GRPC and not all( + k in inputs for k in required_keys + ): + raise ModelServingException( + f"Inference inputs is missing one or more keys. 
Required keys are [{', '.join(required_keys)}]." + ) + elif not recursive_call or (api_protocol == IE.API_PROTOCOL_GRPC): + # if it is the first call to this method, inputs have an invalid type/format + # if GRPC protocol is used, only Dict type is valid for the input values + raise ModelServingException( + "Inference inputs type is not valid. Supported types are dictionary and list." + ) + + def _build_inference_payload( + self, + api_protocol, + data: Union[Dict, List[InferInput]], + inputs: Union[Dict, List[Dict]], + ): + """Build the inference payload for an inference request. If the 'data' parameter is provided, this method ensures + it has the correct format depending on the API protocol. Otherwise, if the 'inputs' parameter is provided, this method + builds the correct request payload depending on the API protocol. + """ + if data is not None: + # data contains the raw payload (dict or InferInput), nothing needs to be changed + return data + else: # parse inputs + return self._parse_inference_inputs(api_protocol, inputs) + + def _parse_inference_inputs( + self, api_protocol, inputs: Union[Dict, List[Dict]], recursive_call=False + ): + if api_protocol == IE.API_PROTOCOL_REST: # REST protocol + if not isinstance(inputs, List): + data = {"instances": [[inputs]]} # wrap inputs in a 2-dim list + else: + data = {"instances": inputs} # use given inputs list by default + # check depth of the list: at least two levels are required for batch inference + # if the content is neither a list or dict, wrap it in an additional list + for i in inputs: + if not isinstance(i, List) and not isinstance(i, Dict): + # if there are no two levels, wrap inputs in a list + data = {"instances": [inputs]} + break + else: # gRPC protocol + if isinstance(inputs, Dict): # Dict + data = InferInput( + name=inputs["name"], + shape=inputs["shape"], + datatype=inputs["datatype"], + data=inputs["data"], + parameters=( + inputs["parameters"] if "parameters" in inputs else None + ), + ) + if not recursive_call: + # if inputs is of type Dict, return a singleton + data = [data] + + else: # List[Dict] + data = inputs + for index, inputs_item in enumerate(inputs): + data[index] = self._parse_inference_inputs( + api_protocol, inputs_item, recursive_call=True + ) + + return data diff --git a/python/hsml/model.py b/python/hsml/model.py index 32cda793b..a1974b471 100644 --- a/python/hsml/model.py +++ b/python/hsml/model.py @@ -20,7 +20,7 @@ from hsml import client, util -from hsml.constants import ARTIFACT_VERSION +from hsml.constants import ARTIFACT_VERSION, INFERENCE_ENDPOINTS as IE from hsml.engine import model_engine from hsml.predictor import Predictor from hsml.resources import PredictorResources @@ -136,6 +136,7 @@ def deploy( inference_logger: Optional[Union[InferenceLogger, dict]] = None, inference_batcher: Optional[Union[InferenceBatcher, dict]] = None, transformer: Optional[Union[Transformer, dict]] = None, + api_protocol: Optional[str] = IE.API_PROTOCOL_REST, ): """Deploy the model. @@ -165,6 +166,7 @@ def deploy( inference_logger: Inference logger configuration. inference_batcher: Inference batcher configuration. transformer: Transformer to be deployed together with the predictor. + api_protocol: API protocol to be enabled in the deployment (i.e., 'REST' or 'GRPC'). Defaults to 'REST'. # Returns `Deployment`: The deployment metadata object of a new or existing deployment. 
@@ -184,6 +186,7 @@ def deploy( inference_logger=inference_logger, inference_batcher=inference_batcher, transformer=transformer, + api_protocol=api_protocol, ) return predictor.deploy() diff --git a/python/hsml/model_serving.py b/python/hsml/model_serving.py index 5851fa4f6..c5a3b2495 100644 --- a/python/hsml/model_serving.py +++ b/python/hsml/model_serving.py @@ -18,7 +18,7 @@ from hsml import util -from hsml.constants import ARTIFACT_VERSION, PREDICTOR_STATE +from hsml.constants import ARTIFACT_VERSION, PREDICTOR_STATE, INFERENCE_ENDPOINTS as IE from hsml.core import serving_api from hsml.model import Model from hsml.predictor import Predictor @@ -156,6 +156,7 @@ def create_predictor( inference_logger: Optional[Union[InferenceLogger, dict, str]] = None, inference_batcher: Optional[Union[InferenceBatcher, dict]] = None, transformer: Optional[Union[Transformer, dict]] = None, + api_protocol: Optional[str] = IE.API_PROTOCOL_REST, ): """Create a Predictor metadata object. @@ -192,6 +193,7 @@ def create_predictor( inference_logger: Inference logger configuration. inference_batcher: Inference batcher configuration. transformer: Transformer to be deployed together with the predictor. + api_protocol: API protocol to be enabled in the deployment (i.e., 'REST' or 'GRPC'). Defaults to 'REST'. # Returns `Predictor`. The predictor metadata object. @@ -210,6 +212,7 @@ def create_predictor( inference_logger=inference_logger, inference_batcher=inference_batcher, transformer=transformer, + api_protocol=api_protocol, ) def create_transformer( diff --git a/python/hsml/predictor.py b/python/hsml/predictor.py index 0feabec61..a06962d72 100644 --- a/python/hsml/predictor.py +++ b/python/hsml/predictor.py @@ -21,7 +21,7 @@ from hsml import deployment from hsml import client -from hsml.constants import ARTIFACT_VERSION, PREDICTOR, MODEL +from hsml.constants import ARTIFACT_VERSION, PREDICTOR, MODEL, INFERENCE_ENDPOINTS from hsml.transformer import Transformer from hsml.predictor_state import PredictorState from hsml.deployable_component import DeployableComponent @@ -52,6 +52,7 @@ def __init__( description: Optional[str] = None, created_at: Optional[str] = None, creator: Optional[str] = None, + api_protocol: Optional[str] = INFERENCE_ENDPOINTS.API_PROTOCOL_REST, **kwargs, ): serving_tool = ( @@ -86,6 +87,7 @@ def __init__( ) self._transformer = util.get_obj_from_json(transformer, Transformer) self._validate_script_file(self._model_framework, self._script_file) + self._api_protocol = api_protocol def deploy(self): """Create a deployment for this predictor and persists it in the Model Serving. 
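
For completeness, a sketch of the equivalent create_predictor() route, continuing the example above; only api_protocol is new in this hunk, the INFERENCE_ENDPOINTS constants come from the constants.py change earlier in this diff, and the assumption that the model metadata object is the first positional argument of create_predictor() follows the existing hsml API.

from hsml.constants import INFERENCE_ENDPOINTS

ms = conn.get_model_serving()
predictor = ms.create_predictor(
    model, api_protocol=INFERENCE_ENDPOINTS.API_PROTOCOL_GRPC
)
deployment = predictor.deploy()

# The protocol is exposed on both the predictor and the deployment object
# (see the api_protocol properties added in predictor.py and deployment.py).
assert deployment.api_protocol == INFERENCE_ENDPOINTS.API_PROTOCOL_GRPC
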
@@ -252,6 +254,7 @@ def extract_fields_from_json(cls, json_decamelized): kwargs["id"] = json_decamelized.pop("id") kwargs["created_at"] = json_decamelized.pop("created") kwargs["creator"] = json_decamelized.pop("creator") + kwargs["api_protocol"] = json_decamelized.pop("api_protocol") return kwargs def update_from_response_json(self, json_dict): @@ -278,6 +281,7 @@ def to_dict(self): "modelServer": self._model_server, "servingTool": self._serving_tool, "predictor": self._script_file, + "apiProtocol": self._api_protocol, } if self._resources is not None: json = {**json, **self._resources.to_dict()} @@ -431,6 +435,15 @@ def requested_instances(self): num_instances += self._transformer.resources.num_instances return num_instances + @property + def api_protocol(self): + """API protocol enabled in the predictor (e.g., HTTP or GRPC).""" + return self._api_protocol + + @api_protocol.setter + def api_protocol(self, api_protocol): + self._api_protocol = api_protocol + def __repr__(self): desc = ( f", description: {self._description!r}" diff --git a/python/hsml/util.py b/python/hsml/util.py index aa5309b94..83631a135 100644 --- a/python/hsml/util.py +++ b/python/hsml/util.py @@ -261,7 +261,7 @@ def get_hostname_replaced_url(sub_path: str): :return: href url """ href = urljoin(client.get_instance()._base_url, sub_path) - url_parsed = client.get_instance().replace_public_host(urlparse(href)) + url_parsed = client.get_instance()._replace_public_host(urlparse(href)) return url_parsed.geturl() diff --git a/python/setup.py b/python/setup.py index 3c1e5858b..6adba2f1d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -43,6 +43,8 @@ def read(fname): "pyjks", "mock", "tqdm", + "grpcio>=1.49.1,<2.0.0", # ^1.49.1 + "protobuf>=3.19.0,<4.0.0", # ^3.19.0 ], extras_require={ "dev": ["pytest", "flake8", "black"],
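
Closing note on runtime behaviour, with a short illustrative sketch continuing the example above: as the comments added in serving_api.py state, the gRPC channel is created lazily on the first predict() call, cached on the deployment object, and released when the deployment is stopped; the grpcio and protobuf pins added to setup.py are what make the generated stubs importable.

# Channel lifecycle for a gRPC deployment (illustrative only):
deployment.predict(data=[single_input])  # first call prints "Initializing gRPC channel..."
                                         # and caches it on deployment._grpc_channel
deployment.predict(data=[single_input])  # later calls reuse the cached channel
deployment.stop()                        # serving engine resets _grpc_channel to None
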