I am observing significant performance differences between the Triton Client SDK (both Python and C++) and the perf_analyzer tool when testing with the same model configuration. I would like to understand the potential reasons for this performance gap and seek guidance on optimizing the SDK-based client implementations.
I have implemented clients using both the Python and C++ SDKs. The requests are configured to match the perf_analyzer setup as closely as possible. However, the throughput and latency are noticeably worse in comparison.
Techniques Tried: asynchronous requests.
Observation: Throughput is approximately 10% of perf_analyzer results. GPU usage remains below 25%.
C++ Client Code
#include <cuda_runtime_api.h>
#include <unistd.h>

#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include "grpc_client.h"
#include "shm_utils.h"

namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
void Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
#define FAIL_IF_CUDA_ERR(FUNC) \
{ \
const cudaError_t result = FUNC; \
if (result != cudaSuccess) { \
std::cerr << "CUDA exception (line " << __LINE__ \
<< "): " << cudaGetErrorName(result) << " (" \
<< cudaGetErrorString(result) << ")" << std::endl; \
exit(1); \
} \
}
void CreateCUDAIPCHandle(
cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0)
{
// Set the GPU device to the desired GPU
FAIL_IF_CUDA_ERR(cudaSetDevice(device_id));
// Create IPC handle for data on the GPU
FAIL_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr));
}
void request(int region, std::string url, bool verbose, tc::Headers http_headers)
{
std::string model_name = "yolo";
std::string model_version = "3";
std::string input_name = "input_data_" + std::to_string(region);
std::string output_name = "output_data_" + std::to_string(region);
// Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(&client, url, verbose),
"unable to create grpc client");
// Unregistering all shared memory regions for a clean
// start.
for (size_t i = 0; i < 3000; i++) {
FAIL_IF_ERR(
client->UnregisterCudaSharedMemory(input_name),
"unable to unregister all cuda shared memory regions");
FAIL_IF_ERR(
client->UnregisterCudaSharedMemory(output_name),
"unable to unregister all cuda shared memory regions");
int64_t batch_size = 1;
std::vector<int64_t> shape{batch_size, 3, 640, 384};
size_t input_byte_size = batch_size * 3 * 640 * 384 * 2;
size_t output_byte_size = batch_size * 846720;  // magic number
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "images", shape, "FP16"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
short int input_data[batch_size * 3 * 640 * 384];
for (size_t i = 0; i < batch_size * 3 * 640 * 384; ++i) {
input_data[i] = 0;
}
// Copy the input data into GPU shared memory.
int* input_d_ptr;
cudaMalloc((void**)&input_d_ptr, input_byte_size);
cudaMemcpy(
(void*)input_d_ptr, (void*)input_data, input_byte_size,
cudaMemcpyHostToDevice);
cudaIpcMemHandle_t input_cuda_handle;
CreateCUDAIPCHandle(&input_cuda_handle, (void*)input_d_ptr);
FAIL_IF_ERR(
client->RegisterCudaSharedMemory(
input_name, input_cuda_handle, 0/* device_id */, input_byte_size),
"failed to register input shared memory region");
FAIL_IF_ERR(
input0_ptr->SetSharedMemory(
input_name, input_byte_size, 0/* offset */),
"unable to set shared memory for INPUT0");
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "output0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
// Create OUTPUT0 in CUDA shared memory.
int* output0_d_ptr;
cudaMalloc((void**)&output0_d_ptr, output_byte_size);
cudaIpcMemHandle_t output_cuda_handle;
CreateCUDAIPCHandle(&output_cuda_handle, (void*)output0_d_ptr);
FAIL_IF_ERR(
client->RegisterCudaSharedMemory(
output_name, output_cuda_handle, 0/* device_id */,
output_byte_size),
"failed to register output shared memory region");
FAIL_IF_ERR(
output0_ptr->SetSharedMemory(
output_name, output_byte_size, 0/* offset */),
"unable to set shared memory for 'OUTPUT0'");
// The inference settings. Will be using default for now.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
std::vector<tc::InferInput*> inputs = {input0_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {output0_ptr.get()};
tc::InferResult* results;
FAIL_IF_ERR(
client->Infer(&results, options, inputs, outputs, http_headers),
"unable to run model");
std::shared_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Validate the results...
// ValidateShapeAndDatatype("OUTPUT0", results_ptr);
// ValidateShapeAndDatatype("OUTPUT1", results_ptr);
// Copy the output data back to the CPU.
short int output0_data[output_byte_size / 2];
cudaMemcpy(
output0_data, output0_d_ptr, output_byte_size, cudaMemcpyDeviceToHost);
// Get shared memory regions active/registered within triton
inference::CudaSharedMemoryStatusResponse status;
FAIL_IF_ERR(
client->CudaSharedMemoryStatus(&status),
"failed to get shared memory status");
// std::cout << "Shared Memory Status:\n" << status.DebugString() << "\n";// Unregister shared memoryFAIL_IF_ERR(
client->UnregisterCudaSharedMemory(input_name),
"unable to unregister shared memory input region");
FAIL_IF_ERR(
client->UnregisterCudaSharedMemory(output_name),
"unable to unregister shared memory output region");
// Free GPU memory
FAIL_IF_CUDA_ERR(cudaFree(input_d_ptr));
FAIL_IF_CUDA_ERR(cudaFree(output0_d_ptr));
}
}
int main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vu:H:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
const int num_threads = 2000;
std::vector<std::thread> threads;
for (int i = 0; i < num_threads; ++i) {
threads.push_back(std::thread(request, i, url, verbose, http_headers));
}
for (auto& t : threads) {
t.join();
}
std::cout << "PASS : Cuda Shared Memory " << std::endl;
return 0;
}
Techniques Tried: Multi-threading.
Observation: Throughput is approximately 10% of perf_analyzer results. GPU usage remains below 25%.
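For comparison, the pattern perf_analyzer uses to keep the GPU busy is roughly: one gRPC client shared by all requests, inputs prepared once, and a fixed number of asynchronous requests kept in flight at all times, rather than creating a client and registering/unregistering shared memory regions inside every request. The following is a minimal sketch of that pattern using the C++ client's AsyncInfer(). The model name, version, shapes, and concurrency numbers are assumptions carried over from the code above, and it sends the input over gRPC instead of CUDA shared memory to keep the example short, so treat it as a starting point rather than a drop-in replacement.

#include <condition_variable>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "grpc_client.h"

namespace tc = triton::client;

int main()
{
  const std::string url = "localhost:8001";
  const std::string model_name = "yolo";  // assumed from the code above
  const size_t total_requests = 6000;     // arbitrary value for the sketch
  const size_t max_in_flight = 64;        // tune, similar to perf_analyzer concurrency

  std::unique_ptr<tc::InferenceServerGrpcClient> client;
  if (!tc::InferenceServerGrpcClient::Create(&client, url, false).IsOk()) {
    std::cerr << "error: unable to create grpc client" << std::endl;
    return 1;
  }

  // Prepare the input once and reuse it for every request (FP16 zeros here).
  // Error checks on Create()/AppendRaw() are omitted for brevity.
  std::vector<int64_t> shape{1, 3, 640, 384};
  std::vector<uint8_t> input_data(1 * 3 * 640 * 384 * 2, 0);
  tc::InferInput* input_raw;
  tc::InferInput::Create(&input_raw, "images", shape, "FP16");
  std::shared_ptr<tc::InferInput> input(input_raw);
  input->AppendRaw(input_data.data(), input_data.size());

  tc::InferRequestedOutput* output_raw;
  tc::InferRequestedOutput::Create(&output_raw, "output0");
  std::shared_ptr<tc::InferRequestedOutput> output(output_raw);

  std::vector<tc::InferInput*> inputs{input.get()};
  std::vector<const tc::InferRequestedOutput*> outputs{output.get()};
  tc::InferOptions options(model_name);
  options.model_version_ = "3";  // assumed from the code above

  // A simple counting semaphore caps the number of outstanding requests, so
  // the server always has work queued without creating thousands of threads.
  std::mutex mu;
  std::condition_variable cv;
  size_t in_flight = 0;
  size_t completed = 0;

  for (size_t i = 0; i < total_requests; ++i) {
    {
      std::unique_lock<std::mutex> lk(mu);
      cv.wait(lk, [&] { return in_flight < max_in_flight; });
      ++in_flight;
    }
    client->AsyncInfer(
        [&](tc::InferResult* result) {
          std::unique_ptr<tc::InferResult> result_ptr(result);
          if (!result_ptr->RequestStatus().IsOk()) {
            std::cerr << "error: inference request failed" << std::endl;
          }
          std::lock_guard<std::mutex> lk(mu);
          --in_flight;
          ++completed;
          cv.notify_all();
        },
        options, inputs, outputs);
  }

  // Wait for the remaining requests to drain.
  std::unique_lock<std::mutex> lk(mu);
  cv.wait(lk, [&] { return completed == total_requests; });
  std::cout << "completed " << completed << " requests" << std::endl;
  return 0;
}

The same idea carries over to the Python client: create the client and register any shared memory regions once, then keep several requests outstanding (for example with the asynchronous gRPC client and a bounded concurrency) instead of paying client-creation and registration costs on every request.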
Issue
Performance Bottleneck: Both Python and C++ SDK clients achieve significantly lower throughput and higher latency than perf_analyzer under similar conditions.
Additional Observations:
Both Python (multi-threading and async) and C++ (multi-threading) clients achieve only about 10% of the throughput seen with perf_analyzer.
GPU Utilization: For both clients, GPU usage remains below 25%, indicating under-utilization of resources.
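One way to narrow down whether the bottleneck is on the client or the server is to compare the client-observed latency against the per-model statistics Triton keeps (request counts, cumulative queue time, cumulative compute time). The short sketch below is an illustration rather than part of the original report; the model name and version are assumed from the code above. It queries those statistics over gRPC with the C++ client and prints the raw protobuf. If queue plus compute time is small relative to the end-to-end latency, the time is being lost on the client or network side rather than on the GPU.

#include <iostream>
#include <memory>
#include <string>

#include "grpc_client.h"

namespace tc = triton::client;

int main()
{
  std::unique_ptr<tc::InferenceServerGrpcClient> client;
  if (!tc::InferenceServerGrpcClient::Create(&client, "localhost:8001", false)
           .IsOk()) {
    std::cerr << "error: unable to create grpc client" << std::endl;
    return 1;
  }

  // Cumulative per-model statistics (request count, queue time, compute time).
  inference::ModelStatisticsResponse stats;
  if (!client->ModelInferenceStatistics(&stats, "yolo", "3").IsOk()) {
    std::cerr << "error: failed to get model statistics" << std::endl;
    return 1;
  }
  std::cout << stats.DebugString() << std::endl;
  return 0;
}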
Expected Guidance:
Are there known limitations or bottlenecks when using the SDK compared to perf_analyzer?
Are there specific best practices to optimize SDK-based client implementations for maximum performance?
Is there a detailed example or reference implementation for achieving perf_analyzer-like performance using the SDK?
I appreciate any insights or recommendations on how to address this issue. Thank you!
It is very slow to get a response; essentially all of the time here is spent waiting on the gRPC call itself.
async def _extract_client_output(self, responses: AsyncResponseIterator):
    """
    Extract and convert results from Triton response based on output_spec.
    """
    results = []
    async for response in responses:
        # BYTES outputs are converted to string arrays; everything else is
        # wrapped as a NumPy array via DLPack.
        current_result = (
            value.to_string_array()
            if value.data_type == TRITONSERVER_DataType.BYTES
            else np.from_dlpack(value)
            for key, value in response.outputs.items()
        )
        results.extend(current_result)
    return results