model_config.proto

// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018, TensorFlow Authors. All rights reserved.

syntax = "proto3";

package nvidia.inferenceserver;

//@@.. cpp:namespace:: nvidia::inferenceserver

//@@
//@@.. cpp:enum:: DataType
//@@
//@@   Data types supported for input and output tensors.
//@@
enum DataType {
  //@@  .. cpp:enumerator:: DataType::INVALID = 0
  TYPE_INVALID = 0;

  //@@  .. cpp:enumerator:: DataType::BOOL = 1
  TYPE_BOOL = 1;

  //@@  .. cpp:enumerator:: DataType::UINT8 = 2
  TYPE_UINT8 = 2;
  //@@  .. cpp:enumerator:: DataType::UINT16 = 3
  TYPE_UINT16 = 3;
  //@@  .. cpp:enumerator:: DataType::UINT32 = 4
  TYPE_UINT32 = 4;
  //@@  .. cpp:enumerator:: DataType::UINT64 = 5
  TYPE_UINT64 = 5;

  //@@  .. cpp:enumerator:: DataType::INT8 = 6
  TYPE_INT8 = 6;
  //@@  .. cpp:enumerator:: DataType::INT16 = 7
  TYPE_INT16 = 7;
  //@@  .. cpp:enumerator:: DataType::INT32 = 8
  TYPE_INT32 = 8;
  //@@  .. cpp:enumerator:: DataType::INT64 = 9
  TYPE_INT64 = 9;

  //@@  .. cpp:enumerator:: DataType::FP16 = 10
  TYPE_FP16 = 10;
  //@@  .. cpp:enumerator:: DataType::FP32 = 11
  TYPE_FP32 = 11;
  //@@  .. cpp:enumerator:: DataType::FP64 = 12
  TYPE_FP64 = 12;

  //@@  .. cpp:enumerator:: DataType::STRING = 13
  TYPE_STRING = 13;
}

//@@
//@@.. cpp:var:: message ModelInstanceGroup
//@@
//@@   A group of one or more instances of a model and resources made
//@@   available for those instances.
//@@
message ModelInstanceGroup
{
  //@@
  //@@  .. cpp:enum:: Kind
  //@@
  //@@     Kind of this instance group.
  //@@
  enum Kind {
    //@@    .. cpp:enumerator:: Kind::KIND_AUTO = 0
    //@@
    //@@       This instance group represents instances that can run on either
    //@@       CPU or GPU. If all GPUs listed in 'gpus' are available then
    //@@       instances will be created on GPU(s), otherwise instances will
    //@@       be created on CPU.
    //@@
    KIND_AUTO = 0;

    //@@    .. cpp:enumerator:: Kind::KIND_GPU = 1
    //@@
    //@@       This instance group represents instances that must run on the
    //@@       GPU.
    //@@
    KIND_GPU = 1;

    //@@    .. cpp:enumerator:: Kind::KIND_CPU = 2
    //@@
    //@@       This instance group represents instances that must run on the
    //@@       CPU.
    //@@
    KIND_CPU = 2;

    //@@    .. cpp:enumerator:: Kind::KIND_MODEL = 3
    //@@
    //@@       This instance group represents instances that should run on the
    //@@       CPU and/or GPU(s) as specified by the model or backend itself.
    //@@       The inference server will not override the model/backend
    //@@       settings.
    //@@       Currently, this option is supported only for Tensorflow models.
    //@@
    KIND_MODEL = 3;
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     Optional name of this group of instances. If not specified the
  //@@     name will be formed as <model name>_<group number>. The name of
  //@@     individual instances will be further formed by a unique instance
  //@@     number and GPU index:
  //@@
  string name = 1;

  //@@  .. cpp:var:: Kind kind
  //@@
  //@@     The kind of this instance group. Default is KIND_AUTO. If
  //@@     KIND_AUTO or KIND_GPU then both 'count' and 'gpu' are valid and
  //@@     may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid
  //@@     and 'gpu' cannot be specified.
  //@@
  Kind kind = 4;

  //@@  .. cpp:var:: int32 count
  //@@
  //@@     For a group assigned to GPU, the number of instances created for
  //@@     each GPU listed in 'gpus'. For a group assigned to CPU the number
  //@@     of instances created. Default is 1.
  int32 count = 2;

  //@@  .. cpp:var:: int32 gpus (repeated)
  //@@
  //@@     GPU(s) where instances should be available. For each GPU listed,
  //@@     'count' instances of the model will be available. Setting 'gpus'
  //@@     to empty (or not specifying at all) is eqivalent to listing all
  //@@     available GPUs.
  //@@
  repeated int32 gpus = 3;

  //@@  .. cpp:var:: string profile (repeated)
  //@@
  //@@     For TensorRT models, using inputs with dynamic shape, this
  //@@     parameter specifies a set of optimization profiles available to this
  //@@     instance group. The inference server will choose the optimal profile
  //@@     based on the shapes of the input tensors. This field should lie
  //@@     between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1
  //@@     and be specified only for TensorRT backend, otherwise an error will
  //@@     be generated.
  //@@
  repeated string profile = 5;
}

//@@
//@@.. cpp:var:: message ModelTensorReshape
//@@
//@@   Reshape specification for input and output tensors.
//@@
message ModelTensorReshape
{
  //@@  .. cpp:var:: int64 shape (repeated)
  //@@
  //@@     The shape to use for reshaping.
  //@@
  repeated int64 shape = 1;
}

//@@
//@@.. cpp:var:: message ModelInput
//@@
//@@   An input required by the model.
//@@
message ModelInput
{
  //@@
  //@@  .. cpp:enum:: Format
  //@@
  //@@     The format for the input.
  //@@
  enum Format {
    //@@    .. cpp:enumerator:: Format::FORMAT_NONE = 0
    //@@
    //@@       The input has no specific format. This is the default.
    //@@
    FORMAT_NONE = 0;

    //@@    .. cpp:enumerator:: Format::FORMAT_NHWC = 1
    //@@
    //@@       HWC image format. Tensors with this format require 3 dimensions
    //@@       if the model does not support batching (max_batch_size = 0) or 4
    //@@       dimensions if the model does support batching (max_batch_size
    //@@       >= 1). In either case the 'dims' below should only specify the
    //@@       3 non-batch dimensions (i.e. HWC or CHW).
    //@@
    FORMAT_NHWC = 1;

    //@@    .. cpp:enumerator:: Format::FORMAT_NCHW = 2
    //@@
    //@@       CHW image format. Tensors with this format require 3 dimensions
    //@@       if the model does not support batching (max_batch_size = 0) or 4
    //@@       dimensions if the model does support batching (max_batch_size
    //@@       >= 1). In either case the 'dims' below should only specify the
    //@@       3 non-batch dimensions (i.e. HWC or CHW).
    //@@
    FORMAT_NCHW = 2;
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the input.
  //@@
  string name = 1;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The data-type of the input.
  //@@
  DataType data_type = 2;

  //@@  .. cpp:var:: Format format
  //@@
  //@@     The format of the input. Optional.
  //@@
  Format format = 3;

  //@@  .. cpp:var:: int64 dims (repeated)
  //@@
  //@@     The dimensions/shape of the input tensor that must be provided
  //@@     when invoking the inference API for this model.
  //@@
  repeated int64 dims = 4;

  //@@  .. cpp:var:: ModelTensorReshape reshape
  //@@
  //@@     The shape expected for this input by the backend. The input will
  //@@     be reshaped to this before being presented to the backend. The
  //@@     reshape must have the same number of elements as the input shape
  //@@     specified by 'dims'. Optional.
  //@@
  ModelTensorReshape reshape = 5;

  //@@  .. cpp:var:: bool is_shape_tensor
  //@@
  //@@     Whether or not the input is a shape tensor to the model. This field
  //@@     is currently supported only for the TensorRT model. An error will be
  //@@     generated if this specification does not comply with underlying
  //@@     model.
  //@@
  bool is_shape_tensor = 6;

  //@@  .. cpp:var:: bool allow_ragged_batch
  //@@
  //@@     Whether or not the input is allowed to be "ragged" in a dynamically
  //@@     created batch. Default is false indicating that two requests will
  //@@     only be batched if this tensor has the same shape in both requests.
  //@@     True indicates that two requests can be batched even if this tensor
  //@@     has a different shape in each request. A true value is currently
  //@@     supported only for custom models.
  //@@
  bool allow_ragged_batch = 7;
}

//@@
//@@.. cpp:var:: message ModelOutput
//@@
//@@   An output produced by the model.
//@@
message ModelOutput
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the output.
  //@@
  string name = 1;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The data-type of the output.
  //@@
  DataType data_type = 2;

  //@@  .. cpp:var:: int64 dims (repeated)
  //@@
  //@@     The dimensions/shape of the output tensor.
  //@@
  repeated int64 dims = 3;

  //@@  .. cpp:var:: ModelTensorReshape reshape
  //@@
  //@@     The shape produced for this output by the backend. The output will
  //@@     be reshaped from this to the shape specifed in 'dims' before being
  //@@     returned in the inference response. The reshape must have the same
  //@@     number of elements as the output shape specified by 'dims'. Optional.
  //@@
  ModelTensorReshape reshape = 5;

  //@@  .. cpp:var:: string label_filename
  //@@
  //@@     The label file associated with this output. Should be specified only
  //@@     for outputs that represent classifications. Optional.
  //@@
  string label_filename = 4;


  //@@  .. cpp:var:: bool is_shape_tensor
  //@@
  //@@     Whether or not the output is a shape tensor to the model. This field
  //@@     is currently supported only for the TensorRT model. An error will be
  //@@     generated if this specification does not comply with underlying
  //@@     model.
  //@@
  bool is_shape_tensor = 6;
}

//@@
//@@.. cpp:var:: message ModelVersionPolicy
//@@
//@@   Policy indicating which versions of a model should be made
//@@   available by the inference server.
//@@
message ModelVersionPolicy
{
  //@@  .. cpp:var:: message Latest
  //@@
  //@@     Serve only the latest version(s) of a model. This is
  //@@     the default policy.
  //@@
  message Latest
  {
    //@@    .. cpp:var:: uint32 num_versions
    //@@
    //@@       Serve only the 'num_versions' highest-numbered versions. T
    //@@       The default value of 'num_versions' is 1, indicating that by
    //@@       default only the single highest-number version of a
    //@@       model will be served.
    //@@
    uint32 num_versions = 1;
  }

  //@@  .. cpp:var:: message All
  //@@
  //@@     Serve all versions of the model.
  //@@
  message All {}

  //@@  .. cpp:var:: message Specific
  //@@
  //@@     Serve only specific versions of the model.
  //@@
  message Specific
  {
    //@@    .. cpp:var:: int64 versions (repeated)
    //@@
    //@@       The specific versions of the model that will be served.
    //@@
    repeated int64 versions = 1;
  }

  //@@  .. cpp:var:: oneof policy_choice
  //@@
  //@@     Each model must implement only a single version policy. The
  //@@     default policy is 'Latest'.
  //@@
  oneof policy_choice
  {
    //@@    .. cpp:var:: Latest latest
    //@@
    //@@       Serve only latest version(s) of the model.
    //@@
    Latest latest = 1;

    //@@    .. cpp:var:: All all
    //@@
    //@@       Serve all versions of the model.
    //@@
    All all = 2;

    //@@    .. cpp:var:: Specific specific
    //@@
    //@@       Serve only specific version(s) of the model.
    //@@
    Specific specific = 3;
  }
}

//@@
//@@.. cpp:var:: message ModelOptimizationPolicy
//@@
//@@   Optimization settings for a model. These settings control if/how a
//@@   model is optimized and prioritized by the backend framework when
//@@   it is loaded.
//@@
message ModelOptimizationPolicy
{
  //@@
  //@@  .. cpp:var:: message Graph
  //@@
  //@@     Enable generic graph optimization of the model. If not specified
  //@@     the framework's default level of optimization is used. Currently
  //@@     only supported for TensorFlow graphdef and savedmodel models and
  //@@     causes XLA to be enabled/disabled for the model.
  //@@
  message Graph
  {
    //@@    .. cpp:var:: int32 level
    //@@
    //@@       The optimization level. Defaults to 0 (zero) if not specified.
    //@@
    //@@         - -1: Disabled
    //@@         -  0: Framework default
    //@@         -  1+: Enable optimization level (greater values indicate
    //@@            higher optimization levels)
    //@@
    int32 level = 1;
  }

  //@@
  //@@  .. cpp:enum:: ModelPriority
  //@@
  //@@     Model priorities. A model will be given scheduling and execution
  //@@     preference over models at lower priorities. Current model
  //@@     priorities only work for TensorRT models.
  //@@
  enum ModelPriority {
    //@@    .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0
    //@@
    //@@       The default model priority.
    //@@
    PRIORITY_DEFAULT = 0;

    //@@    .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1
    //@@
    //@@       The maximum model priority.
    //@@
    PRIORITY_MAX = 1;

    //@@    .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2
    //@@
    //@@       The minimum model priority.
    //@@
    PRIORITY_MIN = 2;
  }

  //@@
  //@@  .. cpp:var:: message Cuda
  //@@
  //@@     CUDA-specific optimization settings.
  //@@
  message Cuda
  {
    //@@    .. cpp:var:: bool graphs
    //@@
    //@@       Use CUDA graphs API to capture model operations and execute
    //@@       them more efficiently. Currently only recognized by TensorRT
    //@@       backend.
    //@@
    bool graphs = 1;
  }

  //@@
  //@@  .. cpp:var:: message ExecutionAccelerators
  //@@
  //@@     Specify the preferred execution accelerators to be used to execute
  //@@     the model. Currently only recognized by ONNX Runtime backend and
  //@@     TensorFlow backend.
  //@@
  //@@     For ONNX Runtime backend, it will deploy the model with the execution
  //@@     accelerators by priority, the priority is determined based on the
  //@@     order that they are set, i.e. the provider at the front has highest
  //@@     priority. Overall, the priority will be in the following order:
  //@@         <gpu_execution_accelerator> (if instance is on GPU)
  //@@         CUDA Execution Provider     (if instance is on GPU)
  //@@         <cpu_execution_accelerator>
  //@@         Default CPU Execution Provider
  //@@
  message ExecutionAccelerators
  {
    //@@
    //@@  .. cpp:var:: message Accelerator
    //@@
    //@@     Specify the accelerator to be used to execute the model.
    //@@     Accelerator with the same name may accept different parameters
    //@@     depending on the backends.
    //@@
    message Accelerator
    {
      //@@    .. cpp:var:: string name
      //@@
      //@@       The name of the execution accelerator.
      //@@
      string name = 1;

      //@@    .. cpp:var:: map<string, string> parameters
      //@@
      //@@       Additional paremeters used to configure the accelerator.
      //@@
      map<string, string> parameters = 2;
    }

    //@@    .. cpp:var:: Accelerator gpu_execution_accelerator (repeated)
    //@@
    //@@       The preferred execution provider to be used if the model instance
    //@@       is deployed on GPU.
    //@@
    //@@       For ONNX Runtime backend, possible value is "tensorrt" as name,
    //@@       and no parameters are required.
    //@@
    //@@       For TensorFlow backend, possible values are "tensorrt", "gpu_io".
    //@@
    //@@       For "tensorrt", the following parameters can be specified:
    //@@         "precision_mode": The precision used for optimization.
    //@@         Allowed values are "FP32" and "FP16". Default value is "FP32".
    //@@
    //@@         "max_cached_engines": The maximum number of cached TensorRT
    //@@         engines in dynamic TensorRT ops. Default value is 100.
    //@@
    //@@         "minimum_segment_size": The smallest model subgraph that will
    //@@         be considered for optimization by TensorRT. Default value is 3.
    //@@
    //@@         "max_workspace_size_bytes": The maximum GPU memory the model
    //@@         can use temporarily during execution. Default value is 1GB.
    //@@
    //@@       For "gpu_io", no parameters are required. If set, the model will
    //@@       be executed using TensorFlow Callable API to set input and output
    //@@       tensors in GPU memory if possible, which can reduce data transfer
    //@@       overhead if the model is used in ensemble. However, the Callable
    //@@       object will be created on model creation and it will request all
    //@@       outputs for every model execution, which may impact the
    //@@       performance if a request does not require all outputs. This
    //@@       optimization will only take affect if the model instance is
    //@@       created with KIND_GPU.
    //@@
    repeated Accelerator gpu_execution_accelerator = 1;

    //@@    .. cpp:var:: Accelerator cpu_execution_accelerator (repeated)
    //@@
    //@@       The preferred execution provider to be used if the model instance
    //@@       is deployed on CPU.
    //@@
    //@@       For ONNX Runtime backend, possible value is "openvino" as name,
    //@@       and no parameters are required.
    //@@
    repeated Accelerator cpu_execution_accelerator = 2;
  }

  //@@
  //@@  .. cpp:var:: message PinnedMemoryBuffer
  //@@
  //@@     Specify whether to use a pinned memory buffer when transferring data
  //@@     between non-pinned system memory and GPU memory. Using a pinned
  //@@     memory buffer for system from/to GPU transfers will typically provide
  //@@     increased performance. For example, in the common use case where the
  //@@     request provides inputs and delivers outputs via non-pinned system
  //@@     memory, if the model instance accepts GPU IOs, the inputs will be
  //@@     processed by two copies: from non-pinned system memory to pinned
  //@@     memory, and from pinned memory to GPU memory. Similarly, pinned
  //@@     memory will be used for delivering the outputs.
  //@@
  message PinnedMemoryBuffer
  {
    //@@    .. cpp:var:: bool enable
    //@@
    //@@       Use pinned memory buffer. Default is true.
    //@@
    bool enable = 1;
  }

  //@@  .. cpp:var:: Graph graph
  //@@
  //@@     The graph optimization setting for the model. Optional.
  //@@
  Graph graph = 1;

  //@@  .. cpp:var:: ModelPriority priority
  //@@
  //@@     The priority setting for the model. Optional.
  //@@
  ModelPriority priority = 2;

  //@@  .. cpp:var:: Cuda cuda
  //@@
  //@@     CUDA-specific optimization settings. Optional.
  //@@
  Cuda cuda = 3;

  //@@  .. cpp:var:: ExecutionAccelerators execution_accelerators
  //@@
  //@@     The accelerators used for the model. Optional.
  //@@
  ExecutionAccelerators execution_accelerators = 4;

  //@@  .. cpp:var:: PinnedMemoryBuffer input_pinned_memory
  //@@
  //@@     Use pinned memory buffer when the data transfer for inputs
  //@@     is between GPU memory and non-pinned system memory.
  //@@     Default is true.
  //@@
  PinnedMemoryBuffer input_pinned_memory = 5;

  //@@  .. cpp:var:: PinnedMemoryBuffer output_pinned_memory
  //@@
  //@@     Use pinned memory buffer when the data transfer for outputs
  //@@     is between GPU memory and non-pinned system memory.
  //@@     Default is true.
  //@@
  PinnedMemoryBuffer output_pinned_memory = 6;
}

//@@
//@@.. cpp:var:: message ModelDynamicBatching
//@@
//@@   Dynamic batching configuration. These settings control how dynamic
//@@   batching operates for the model.
//@@
message ModelDynamicBatching
{
  //@@  .. cpp:var:: int32 preferred_batch_size (repeated)
  //@@
  //@@     Preferred batch sizes for dynamic batching. If a batch of one of
  //@@     these sizes can be formed it will be executed immediately.  If
  //@@     not specified a preferred batch size will be chosen automatically
  //@@     based on model and GPU characteristics.
  //@@
  repeated int32 preferred_batch_size = 1;

  //@@  .. cpp:var:: uint64 max_queue_delay_microseconds
  //@@
  //@@     The maximum time, in microseconds, a request will be delayed in
  //@@     the scheduling queue to wait for additional requests for
  //@@     batching. Default is 0.
  //@@
  uint64 max_queue_delay_microseconds = 2;

  //@@  .. cpp:var:: bool preserve_ordering
  //@@
  //@@     Should the dynamic batcher preserve the ordering of responses to
  //@@     match the order of requests received by the scheduler. Default is
  //@@     false. If true, the responses will be returned in the same order as
  //@@     the order of requests sent to the scheduler. If false, the responses
  //@@     may be returned in arbitrary order. This option is specifically
  //@@     needed when a sequence of related inference requests (i.e. inference
  //@@     requests with the same correlation ID) are sent to the dynamic
  //@@     batcher to ensure that the sequence responses are in the correct
  //@@     order.
  //@@
  bool preserve_ordering = 3;
}

//@@
//@@.. cpp:var:: message ModelSequenceBatching
//@@
//@@   Sequence batching configuration. These settings control how sequence
//@@   batching operates for the model.
//@@
message ModelSequenceBatching
{
  //@@  .. cpp:var:: message Control
  //@@
  //@@     A control is a signal that the sequence batcher uses to
  //@@     communicate with a backend.
  //@@
  message Control
  {
    //@@
    //@@    .. cpp:enum:: Kind
    //@@
    //@@       The kind of the control.
    //@@
    enum Kind {
      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0
      //@@
      //@@         A new sequence is/is-not starting. If true a sequence is
      //@@         starting, if false a sequence is continuing. Must
      //@@         specify either int32_false_true or fp32_false_true for
      //@@         this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_START = 0;

      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1
      //@@
      //@@         A sequence is/is-not ready for inference. If true the
      //@@         input tensor data is valid and should be used. If false
      //@@         the input tensor data is invalid and inferencing should
      //@@         be "skipped".  Must specify either int32_false_true or
      //@@         fp32_false_true for this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_READY = 1;

      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2
      //@@
      //@@         A sequence is/is-not ending. If true a sequence is
      //@@         ending, if false a sequence is continuing. Must
      //@@         specify either int32_false_true or fp32_false_true for
      //@@         this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_END = 2;

      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3
      //@@
      //@@         The correlation ID of the sequence. The correlation ID
      //@@         is an uint64_t value that is communicated in whole or
      //@@         in part by the tensor. The tensor's datatype must be
      //@@         specified by data_type and must be TYPE_UINT64, TYPE_INT64,
      //@@         TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified
      //@@         the correlation ID will be truncated to the low-order 32
      //@@         bits. This control is optional.
      //@@
      CONTROL_SEQUENCE_CORRID = 3;
    }

    //@@    .. cpp:var:: Kind kind
    //@@
    //@@       The kind of this control.
    //@@
    Kind kind = 1;

    //@@    .. cpp:var:: int32 int32_false_true (repeated)
    //@@
    //@@       The control's true and false setting is indicated by setting
    //@@       a value in an int32 tensor. The tensor must be a
    //@@       1-dimensional tensor with size equal to the batch size of
    //@@       the request. 'int32_false_true' must have two entries: the
    //@@       first the false value and the second the true value.
    //@@
    repeated int32 int32_false_true = 2;

    //@@    .. cpp:var:: float fp32_false_true (repeated)
    //@@
    //@@       The control's true and false setting is indicated by setting
    //@@       a value in a fp32 tensor. The tensor must be a
    //@@       1-dimensional tensor with size equal to the batch size of
    //@@       the request. 'fp32_false_true' must have two entries: the
    //@@       first the false value and the second the true value.
    //@@
    repeated float fp32_false_true = 3;

    //@@    .. cpp:var:: DataType data_type
    //@@
    //@@       The control's datatype.
    //@@
    DataType data_type = 4;
  }

  //@@  .. cpp:var:: message ControlInput
  //@@
  //@@     The sequence control values to communicate by a model input.
  //@@
  message ControlInput
  {
    //@@    .. cpp:var:: string name
    //@@
    //@@       The name of the model input.
    //@@
    string name = 1;

    //@@    .. cpp:var:: Control control (repeated)
    //@@
    //@@       The control value(s) that should be communicated to the
    //@@       model using this model input.
    //@@
    repeated Control control = 2;
  }

  //@@  .. cpp:var:: message StrategyDirect
  //@@
  //@@     The sequence batcher uses a specific, unique batch
  //@@     slot for each sequence. All inference requests in a
  //@@     sequence are directed to the same batch slot in the same
  //@@     model instance over the lifetime of the sequence. This
  //@@     is the default strategy.
  //@@
  message StrategyDirect {}

  //@@  .. cpp:var:: message StrategyOldest
  //@@
  //@@     The sequence batcher maintains up to 'max_candidate_sequences'
  //@@     candidate sequences. 'max_candidate_sequences' can be greater
  //@@     than the model's 'max_batch_size'. For inferencing the batcher
  //@@     chooses from the candidate sequences up to 'max_batch_size'
  //@@     inference requests. Requests are chosen in an oldest-first
  //@@     manner across all candidate sequences. A given sequence is
  //@@     not guaranteed to be assigned to the same batch slot for
  //@@     all inference requests of that sequence.
  //@@
  message StrategyOldest
  {
    //@@    .. cpp:var:: int32 max_candidate_sequences
    //@@
    //@@       Maximum number of candidate sequences that the batcher
    //@@       maintains. Excess seqences are kept in an ordered backlog
    //@@       and become candidates when existing candidate sequences
    //@@       complete.
    //@@
    int32 max_candidate_sequences = 1;

    //@@    .. cpp:var:: int32 preferred_batch_size (repeated)
    //@@
    //@@       Preferred batch sizes for dynamic batching of candidate
    //@@       sequences. If a batch of one of these sizes can be formed
    //@@       it will be executed immediately.  If not specified a
    //@@       preferred batch size will be chosen automatically
    //@@       based on model and GPU characteristics.
    //@@
    repeated int32 preferred_batch_size = 2;

    //@@    .. cpp:var:: uint64 max_queue_delay_microseconds
    //@@
    //@@       The maximum time, in microseconds, a candidate request
    //@@       will be delayed in the dynamic batch scheduling queue to
    //@@       wait for additional requests for batching. Default is 0.
    //@@
    uint64 max_queue_delay_microseconds = 3;
  }

  //@@  .. cpp:var:: oneof strategy_choice
  //@@
  //@@     The strategy used by the sequence batcher. Default strategy
  //@@     is 'direct'.
  //@@
  oneof strategy_choice
  {
    //@@    .. cpp:var:: StrategyDirect direct
    //@@
    //@@       StrategyDirect scheduling strategy.
    //@@
    StrategyDirect direct = 3;

    //@@    .. cpp:var:: StrategyOldest oldest
    //@@
    //@@       StrategyOldest scheduling strategy.
    //@@
    StrategyOldest oldest = 4;
  }

  //@@  .. cpp:var:: uint64 max_sequence_idle_microseconds
  //@@
  //@@     The maximum time, in microseconds, that a sequence is allowed to
  //@@     be idle before it is aborted. The inference server considers a
  //@@     sequence idle when it does not have any inference request queued
  //@@     for the sequence. If this limit is exceeded, the inference server
  //@@     will free the sequence slot allocated by the sequence and make it
  //@@     available for another sequence. If not specified (or specified as
  //@@     zero) a default value of 1000000 (1 second) is used.
  //@@
  uint64 max_sequence_idle_microseconds = 1;

  //@@  .. cpp:var:: ControlInput control_input (repeated)
  //@@
  //@@     The model input(s) that the server should use to communicate
  //@@     sequence start, stop, ready and similar control values to the
  //@@     model.
  //@@
  repeated ControlInput control_input = 2;
}

//@@
//@@.. cpp:var:: message ModelEnsembling
//@@
//@@   Model ensembling configuration. These settings specify the models that
//@@   compose the ensemble and how data flows between the models.
//@@
message ModelEnsembling
{
  //@@  .. cpp:var:: message Step
  //@@
  //@@     Each step specifies a model included in the ensemble,
  //@@     maps ensemble tensor names to the model input tensors,
  //@@     and maps model output tensors to ensemble tensor names
  //@@
  message Step
  {
    //@@  .. cpp:var:: string model_name
    //@@
    //@@     The name of the model to execute for this step of the ensemble.
    //@@
    string model_name = 1;

    //@@  .. cpp:var:: int64 model_version
    //@@
    //@@     The version of the model to use for inference. If -1
    //@@     the latest/most-recent version of the model is used.
    //@@
    int64 model_version = 2;

    //@@  .. cpp:var:: map<string,string> input_map
    //@@
    //@@     Map from name of an input tensor on this step's model to ensemble
    //@@     tensor name. The ensemble tensor must have the same data type and
    //@@     shape as the model input. Each model input must be assigned to
    //@@     one ensemble tensor, but the same ensemble tensor can be assigned
    //@@     to multiple model inputs.
    //@@
    map<string, string> input_map = 3;

    //@@  .. cpp:var:: map<string,string> output_map
    //@@
    //@@     Map from name of an output tensor on this step's model to ensemble
    //@@     tensor name. The data type and shape of the ensemble tensor will
    //@@     be inferred from the model output. It is optional to assign all
    //@@     model outputs to ensemble tensors. One ensemble tensor name
    //@@     can appear in an output map only once.
    //@@
    map<string, string> output_map = 4;
  }

  //@@  .. cpp:var:: Step step (repeated)
  //@@
  //@@     The models and the input / output mappings used within the ensemble.
  //@@
  repeated Step step = 1;
}

//@@
//@@.. cpp:var:: message ModelParameter
//@@
//@@   A model parameter.
//@@
message ModelParameter
{
  //@@  .. cpp:var:: string string_value
  //@@
  //@@     The string value of the parameter.
  //@@
  string string_value = 1;
}

//@@
//@@.. cpp:var:: message ModelWarmup
//@@
//@@   Settings used to construct the request sample for model warmup.
//@@
message ModelWarmup
{
  //@@
  //@@  .. cpp:var:: message Input
  //@@
  //@@     Meta data associated with an input.
  //@@
  message Input
  {
    //@@    .. cpp:var:: DataType data_type
    //@@
    //@@       The data-type of the input.
    //@@
    DataType data_type = 1;

    //@@    .. cpp:var:: int64 dims (repeated)
    //@@
    //@@       The shape of the input tensor, not including the batch dimension.
    //@@
    repeated int64 dims = 2;

    //@@    .. cpp:var:: oneof input_data_type
    //@@
    //@@       Specify how the input data is generated. If the input has STRING
    //@@       data type and 'random_data' is set, the data generation will fall
    //@@       back to 'zero_data'.
    //@@
    oneof input_data_type
    {
      //@@
      //@@    .. cpp:var:: bool zero_data
      //@@
      //@@       The identifier for using zeros as input data. Note that the
      //@@       value of 'zero_data' will not be checked, instead, zero data
      //@@       will be used as long as the field is set.
      //@@
      bool zero_data = 3;

      //@@
      //@@    .. cpp:var:: bool random_data
      //@@
      //@@       The identifier for using random data as input data. Note that
      //@@       the value of 'random_data' will not be checked, instead,
      //@@       random data will be used as long as the field is set.
      //@@
      bool random_data = 4;

      //@@    .. cpp:var:: string input_data_file
      //@@
      //@@       The file whose content will be used as raw input data in
      //@@       row-major order. The file must be provided in a sub-directory
      //@@       'warmup' under the model directory.
      //@@
      string input_data_file = 5;
    }
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the request sample.
  //@@
  string name = 1;

  //@@  .. cpp:var:: uint32 batch_size
  //@@
  //@@     The batch size of the inference request. This must be >= 1. For
  //@@     models that don't support batching, batch_size must be 1. If
  //@@     batch_size > 1, the 'inputs' specified below will be duplicated to
  //@@     match the batch size requested.
  //@@
  uint32 batch_size = 2;

  //@@  .. cpp:var:: map<string, Input> inputs
  //@@
  //@@     The warmup meta data associated with every model input, including
  //@@     control tensors.
  //@@
  map<string, Input> inputs = 3;
}

//@@
//@@.. cpp:var:: message ModelConfig
//@@
//@@   A model configuration.
//@@
message ModelConfig
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string platform
  //@@
  //@@     The framework for the model. Possible values are
  //@@     "tensorrt_plan", "tensorflow_graphdef",
  //@@     "tensorflow_savedmodel", "caffe2_netdef",
  //@@     "onnxruntime_onnx", "pytorch_libtorch" and "custom".
  //@@
  string platform = 2;

  //@@  .. cpp:var:: ModelVersionPolicy version_policy
  //@@
  //@@     Policy indicating which version(s) of the model will be served.
  //@@
  ModelVersionPolicy version_policy = 3;

  //@@  .. cpp:var:: int32 max_batch_size
  //@@
  //@@     Maximum batch size allowed for inference. This can only decrease
  //@@     what is allowed by the model itself. A max_batch_size value of 0
  //@@     indicates that batching is not allowed for the model and the
  //@@     dimension/shape of the input and output tensors must exactly
  //@@     match what is specified in the input and output configuration. A
  //@@     max_batch_size value > 0 indicates that batching is allowed and
  //@@     so the model expects the input tensors to have an additional
  //@@     initial dimension for the batching that is not specified in the
  //@@     input (for example, if the model supports batched inputs of
  //@@     2-dimensional tensors then the model configuration will specify
  //@@     the input shape as [ X, Y ] but the model will expect the actual
  //@@     input tensors to have shape [ N, X, Y ]). For max_batch_size > 0
  //@@     returned outputs will also have an additional initial dimension
  //@@     for the batch.
  //@@
  int32 max_batch_size = 4;

  //@@  .. cpp:var:: ModelInput input (repeated)
  //@@
  //@@     The inputs request by the model.
  //@@
  repeated ModelInput input = 5;

  //@@  .. cpp:var:: ModelOutput output (repeated)
  //@@
  //@@     The outputs produced by the model.
  //@@
  repeated ModelOutput output = 6;

  //@@  .. cpp:var:: ModelOptimizationPolicy optimization
  //@@
  //@@     Optimization configuration for the model. If not specified
  //@@     then default optimization policy is used.
  //@@
  ModelOptimizationPolicy optimization = 12;

  //@@  .. cpp:var:: oneof scheduling_choice
  //@@
  //@@     The scheduling policy for the model. If not specified the
  //@@     default scheduling policy is used for the model. The default
  //@@     policy is to execute each inference request independently.
  //@@
  oneof scheduling_choice
  {
    //@@    .. cpp:var:: ModelDynamicBatching dynamic_batching
    //@@
    //@@       If specified, enables the dynamic-batching scheduling
    //@@       policy. With dynamic-batching the scheduler may group
    //@@       together independent requests into a single batch to
    //@@       improve inference throughput.
    //@@
    ModelDynamicBatching dynamic_batching = 11;

    //@@    .. cpp:var:: ModelSequenceBatching sequence_batching
    //@@
    //@@       If specified, enables the sequence-batching scheduling
    //@@       policy. With sequence-batching, inference requests
    //@@       with the same correlation ID are routed to the same
    //@@       model instance. Multiple sequences of inference requests
    //@@       may be batched together into a single batch to
    //@@       improve inference throughput.
    //@@
    ModelSequenceBatching sequence_batching = 13;

    //@@    .. cpp:var:: ModelEnsembling ensemble_scheduling
    //@@
    //@@       If specified, enables the model-ensembling scheduling
    //@@       policy. With model-ensembling, inference requests
    //@@       will be processed according to the specification, such as an
    //@@       execution sequence of models. The input specified in this model
    //@@       config will be the input for the ensemble, and the output
    //@@       specified will be the output of the ensemble.
    //@@
    ModelEnsembling ensemble_scheduling = 15;
  }

  //@@  .. cpp:var:: ModelInstanceGroup instance_group (repeated)
  //@@
  //@@     Instances of this model. If not specified, one instance
  //@@     of the model will be instantiated on each available GPU.
  //@@
  repeated ModelInstanceGroup instance_group = 7;

  //@@  .. cpp:var:: string default_model_filename
  //@@
  //@@     Optional filename of the model file to use if a
  //@@     compute-capability specific model is not specified in
  //@@     :cpp:var:`cc_model_filenames`. If not specified the default name
  //@@     is 'model.graphdef', 'model.savedmodel', 'model.plan' or
  //@@     'model.netdef' depending on the model type.
  //@@
  string default_model_filename = 8;

  //@@  .. cpp:var:: map<string,string> cc_model_filenames
  //@@
  //@@     Optional map from CUDA compute capability to the filename of
  //@@     the model that supports that compute capability. The filename
  //@@     refers to a file within the model version directory.
  //@@
  map<string, string> cc_model_filenames = 9;

  //@@  .. cpp:var:: map<string,string> metric_tags
  //@@
  //@@     Optional metric tags. User-specific key-value pairs for metrics
  //@@     reported for this model. These tags are applied to the metrics
  //@@     reported on the HTTP metrics port.
  //@@
  map<string, string> metric_tags = 10;

  //@@  .. cpp:var:: map<string,ModelParameter> parameters
  //@@
  //@@     Optional model parameters. User-specified parameter values that
  //@@     are made available to custom backends.
  //@@
  map<string, ModelParameter> parameters = 14;

  //@@  .. cpp:var:: ModelWarmup model_warmup (repeated)
  //@@
  //@@     Warmup setting of this model. If specified, all instances
  //@@     will be run with the request samples in sequence before
  //@@     serving the model.
  //@@     This field can only be specified if the model is not an ensemble
  //@@     model.
  //@@
  repeated ModelWarmup model_warmup = 16;
}