PA: Fixed scheduler manager (#88)
* Do I know what I am doing?

* compiles apparently

* Move to request_rate

* Fix small bugs

* more small fixes

* Maybe fix small issue

* Actually initialize empty

* Convert to const

* Drop old files

* Force request-count to equal schedule length

* Copy from GenAI branch

* Deformat

* Reorganize arg order

Fix pre-commit errors

* Fix errors, modify design

* fix pre-commit

Fix comments

---------

Co-authored-by: lkomali <[email protected]>
Co-authored-by: Harshini Komali <[email protected]>
3 people authored Nov 7, 2024
1 parent a793b8b commit 5dfebe1
Showing 10 changed files with 292 additions and 28 deletions.
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -130,6 +130,7 @@ set(
data_loader.cc
concurrency_manager.cc
request_rate_manager.cc
custom_request_schedule_manager.cc
load_worker.cc
concurrency_worker.cc
request_rate_worker.cc
@@ -160,6 +161,7 @@ set(
data_loader.h
concurrency_manager.h
request_rate_manager.h
custom_request_schedule_manager.h
custom_load_manager.h
iworker.h
load_worker.h
29 changes: 24 additions & 5 deletions src/command_line_parser.cc
@@ -53,7 +53,7 @@ SplitString(const std::string& str, const std::string& delimiter = ":")
std::vector<std::string> substrs;
size_t pos = 0;
while (pos != std::string::npos) {
size_t colon_pos = str.find(":", pos);
size_t colon_pos = str.find(delimiter, pos);
substrs.push_back(str.substr(pos, colon_pos - pos));
if (colon_pos == std::string::npos) {
pos = colon_pos;
@@ -908,6 +908,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"endpoint", required_argument, 0, long_option_idx_base + 61},
{"request-count", required_argument, 0, long_option_idx_base + 62},
{"warmup-request-count", required_argument, 0, long_option_idx_base + 63},
{"schedule", required_argument, 0, long_option_idx_base + 64},
{0, 0, 0, 0}};

// Parse commandline...
@@ -1647,7 +1648,9 @@ CLParser::ParseCommandLine(int argc, char** argv)
if (std::stoi(optarg) < 0) {
Usage("Failed to parse --request-count. The value must be > 0.");
}
params_->request_count = std::stoi(optarg);
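// Only set --request-count here if it was not already fixed by --schedule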
if (params_->request_count == 0) {
params_->request_count = std::stoi(optarg);
}
break;
}
case long_option_idx_base + 63: {
@@ -1659,6 +1662,17 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->warmup_request_count = std::stoi(optarg);
break;
}
case long_option_idx_base + 64: {
std::vector<float> schedule;
std::vector<std::string> float_strings = SplitString(optarg, ",");
for (const std::string& str : float_strings) {
schedule.push_back(std::stof(str));
}
params_->schedule = schedule;
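// Force request-count to equal the schedule length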
params_->request_count = schedule.size();
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
@@ -1977,9 +1991,13 @@ CLParser::VerifyOptions()
Usage(
"perf_analyzer supports only grpc protocol for TensorFlow Serving.");
} else if (params_->streaming) {
Usage("perf_analyzer does not support streaming for TensorFlow Serving.");
Usage(
"perf_analyzer does not support streaming for TensorFlow "
"Serving.");
} else if (params_->async) {
Usage("perf_analyzer does not support async API for TensorFlow Serving.");
Usage(
"perf_analyzer does not support async API for TensorFlow "
"Serving.");
} else if (!params_->using_batch_size) {
params_->batch_size = 0;
}
Expand Down Expand Up @@ -2008,7 +2026,8 @@ CLParser::VerifyOptions()
if (params_->async && params_->streaming &&
params_->shared_memory_type != SharedMemoryType::NO_SHARED_MEMORY) {
Usage(
"Cannot use --shared-memory=system or --shared-memory=cuda with "
"Cannot use --shared-memory=system or --shared-memory=cuda "
"with "
"--service-kind=triton_c_api and --async and --streaming.");
}

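The new `--schedule` flag leans on the SplitString fix above: the helper previously searched for a hard-coded ":" and ignored its delimiter argument, so comma-separated schedules would not have split. Below is a minimal standalone sketch of the fixed parsing path; the tail of the loop is cut off by the hunk, so the else branch and the main() driver are assumptions for illustration, not the committed code.

#include <iostream>
#include <string>
#include <vector>

// Mirrors the fixed SplitString: the delimiter parameter is now used
// instead of a hard-coded ":".
std::vector<std::string>
SplitString(const std::string& str, const std::string& delimiter = ":")
{
  std::vector<std::string> substrs;
  size_t pos = 0;
  while (pos != std::string::npos) {
    size_t delim_pos = str.find(delimiter, pos);
    substrs.push_back(str.substr(pos, delim_pos - pos));
    if (delim_pos == std::string::npos) {
      pos = delim_pos;
    } else {
      pos = delim_pos + delimiter.length();  // assumed: skip past delimiter
    }
  }
  return substrs;
}

int main()
{
  // "--schedule 1,2,4,5,6.5" reaches the option handler as this string.
  std::vector<float> schedule;
  for (const std::string& s : SplitString("1,2,4,5,6.5", ",")) {
    schedule.push_back(std::stof(s));
  }
  std::cout << "parsed " << schedule.size() << " timestamps\n";  // prints 5
  return 0;
}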
2 changes: 2 additions & 0 deletions src/command_line_parser.h
@@ -157,6 +157,8 @@ struct PerfAnalyzerParameters {
Range<uint64_t> periodic_concurrency_range{1, 1, 1};
uint64_t request_period{10};
size_t warmup_request_count{0};

std::vector<float> schedule{};
};

using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
121 changes: 121 additions & 0 deletions src/custom_request_schedule_manager.cc
@@ -0,0 +1,121 @@
// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "custom_request_schedule_manager.h"

namespace triton::perfanalyzer {

cb::Error
CustomRequestScheduleManager::Create(
const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager)
{
std::unique_ptr<CustomRequestScheduleManager> local_manager(
new CustomRequestScheduleManager(params, parser, factory));

*manager = std::move(local_manager);

return cb::Error::Success;
}

CustomRequestScheduleManager::CustomRequestScheduleManager(
const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: RequestRateManager(
params->async, params->streaming, Distribution::CUSTOM,
params->batch_size, params->measurement_window_ms, params->max_trials,
params->max_threads, params->num_of_sequences,
params->shared_memory_type, params->output_shm_size,
params->serial_sequences, parser, factory,
params->request_parameters),
schedule_(params->schedule)
{
}

cb::Error
CustomRequestScheduleManager::PerformWarmup(
double request_rate, size_t warmup_request_count)
{
if (warmup_request_count == 0) {
return cb::Error::Success;
}
RETURN_IF_ERROR(ChangeRequestRate(request_rate, warmup_request_count));
WaitForWarmupAndCleanup();
return cb::Error::Success;
}

cb::Error
CustomRequestScheduleManager::ChangeRequestRate(
const double request_rate, const size_t request_count)
{
PauseWorkers();
ConfigureThreads(request_count);
GenerateSchedule(request_rate, schedule_);
ResumeWorkers();

return cb::Error::Success;
}

void
CustomRequestScheduleManager::GenerateSchedule(
const double request_rate, const std::vector<float>& schedule)
{
std::vector<float> scaled_schedule;
scaled_schedule.reserve(schedule.size());
if (schedule.size() > 0) {
for (const auto& value : schedule) {
scaled_schedule.push_back(value / static_cast<float>(request_rate));
}
}
// Build per-worker schedules from the rate-scaled timestamps
auto worker_schedules = CreateWorkerSchedules(scaled_schedule);
GiveSchedulesToWorkers(worker_schedules);
}

std::vector<RateSchedulePtr_t>
CustomRequestScheduleManager::CreateWorkerSchedules(
const std::vector<float>& schedule)
{
std::vector<RateSchedulePtr_t> worker_schedules =
CreateEmptyWorkerSchedules();
std::vector<size_t> thread_ids{CalculateThreadIds()};
std::chrono::nanoseconds next_timestamp(0);
size_t thread_id_index = 0;
size_t worker_index = 0;

for (const float& val : schedule) {
next_timestamp = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::duration<float>(val));
worker_index = thread_ids[thread_id_index];
thread_id_index = (thread_id_index + 1) % thread_ids.size();
worker_schedules[worker_index]->intervals.emplace_back(next_timestamp);
}
SetScheduleDurations(worker_schedules);

return worker_schedules;
}

} // namespace triton::perfanalyzer
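GenerateSchedule and CreateWorkerSchedules together implement the core transform: each user-supplied timestamp is divided by the request rate, converted to nanoseconds, and dealt round-robin across the worker threads. The sketch below reproduces that arithmetic in isolation; plain vectors stand in for RateSchedulePtr_t and the CalculateThreadIds() table, which are simplifying assumptions, not the real types.

#include <chrono>
#include <iostream>
#include <vector>

int main()
{
  const std::vector<float> schedule{1.0f, 2.0f, 4.0f, 5.0f, 6.5f};
  const double request_rate = 2.0;

  // Scale each timestamp by the request rate, as GenerateSchedule does.
  std::vector<float> scaled;
  scaled.reserve(schedule.size());
  for (float v : schedule) {
    scaled.push_back(v / static_cast<float>(request_rate));
  }

  // Deal the timestamps round-robin across workers (two assumed here),
  // as CreateWorkerSchedules does with its thread-ID table.
  const size_t num_workers = 2;
  std::vector<std::vector<std::chrono::nanoseconds>> worker_schedules(
      num_workers);
  size_t worker = 0;
  for (float v : scaled) {
    worker_schedules[worker].push_back(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::duration<float>(v)));
    worker = (worker + 1) % num_workers;
  }

  // Worker 0 fires at 0.5s, 2s, 3.25s; worker 1 at 1s, 2.5s.
  for (size_t w = 0; w < num_workers; ++w) {
    std::cout << "worker " << w << ":";
    for (auto ns : worker_schedules[w]) {
      std::cout << ' ' << ns.count() << "ns";
    }
    std::cout << '\n';
  }
  return 0;
}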
109 changes: 109 additions & 0 deletions src/custom_request_schedule_manager.h
@@ -0,0 +1,109 @@
// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "command_line_parser.h"
#include "load_manager.h"
#include "request_rate_manager.h"

namespace triton::perfanalyzer {

//==============================================================================
/// CustomRequestScheduleManager is a helper class that sends inference
/// requests to the inference server according to a schedule set by the user.
///
/// Detail:
/// An instance of this load manager is created at the start of perf_analyzer
/// and is used to send each request at the particular time defined by the
/// user. The times at which requests should be sent are given via the
/// `schedule` option. For example, if the `schedule` is set to 1,2,4,5,6.5,
/// CustomRequestScheduleManager sends requests at the 1st second, 2nd second,
/// 4th second, and so on.
///
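/// Example: a hypothetical invocation (model name assumed) pairing this
/// manager with the `--schedule` flag added in command_line_parser.cc:
///
///   perf_analyzer -m my_model --schedule 1,2,4,5,6.5
///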

class CustomRequestScheduleManager : public RequestRateManager {
public:
~CustomRequestScheduleManager() = default;

/// Creates an object of CustomRequestScheduleManager
/// \param params A PAParamsPtr (std::shared_ptr<PerfAnalyzerParameters>) that
/// holds the configuration parameters for the CustomRequestScheduleManager
/// \param parser The ModelParser for the model being profiled
/// \param factory The factory used to create client backends
/// \param manager Returns the newly created CustomRequestScheduleManager
/// \return cb::Error object indicating success or failure
///
static cb::Error Create(
const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager);

/// Performs warmup for benchmarking by sending a fixed number of requests
/// according to the specified request rate
/// \param request_rate The rate at which requests must be issued to the
/// server
/// \param warmup_request_count The number of warmup requests to send
/// \return cb::Error object indicating success or failure
cb::Error PerformWarmup(
double request_rate, size_t warmup_request_count) override;

/// Adjusts the rate of issuing requests to be the same as 'request_rate'
/// \param request_rate The rate at which requests must be issued to the
/// server
/// \param request_count The number of requests to generate when profiling
/// \return cb::Error object indicating success or failure
cb::Error ChangeRequestRate(
const double request_rate, const size_t request_count) override;


protected:
/// Constructor for CustomRequestScheduleManager
///
/// Initializes a CustomRequestScheduleManager instance using a PAParamsPtr
/// object that contains all necessary parameters for request scheduling.
///
/// \param params A PAParamsPtr (std::shared_ptr<PerfAnalyzerParameters>) that
/// holds the configuration parameters for the CustomRequestScheduleManager
/// \param parser The ModelParser for the model being profiled
/// \param factory The factory used to create client backends
///
CustomRequestScheduleManager(
const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);

/// Generates and updates the request schedule as per the given request rate
/// and schedule
/// \param request_rate The request rate to use for the new schedule
/// \param schedule The vector containing the schedule for requests
void GenerateSchedule(
const double request_rate, const std::vector<float>& schedule);

/// Creates worker schedules based on the provided schedule
/// \param schedule The vector containing the schedule for requests
/// \return A vector of RateSchedulePtr_t representing the worker schedules
std::vector<RateSchedulePtr_t> CreateWorkerSchedules(
const std::vector<float>& schedule);

/// The vector containing the schedule for requests
std::vector<float> schedule_;
};

} // namespace triton::perfanalyzer
5 changes: 3 additions & 2 deletions src/inference_profiler.cc
@@ -40,7 +40,7 @@
#include "constants.h"
#include "doctest.h"

namespace triton { namespace perfanalyzer {
namespace triton::perfanalyzer {
cb::Error
ReportPrometheusMetrics(const Metrics& metrics)
{
@@ -622,6 +622,7 @@ InferenceProfiler::Profile(
is_stable = false;
meets_threshold = true;


RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
->PerformWarmup(request_rate, warmup_request_count));
RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
@@ -1872,4 +1873,4 @@ InferenceProfiler::MergeMetrics(
return cb::Error::Success;
}

}} // namespace triton::perfanalyzer
} // namespace triton::perfanalyzer
6 changes: 4 additions & 2 deletions src/inference_profiler.h
@@ -39,6 +39,7 @@
#include "concurrency_manager.h"
#include "constants.h"
#include "custom_load_manager.h"
#include "custom_request_schedule_manager.h"
#include "metrics.h"
#include "metrics_manager.h"
#include "model_parser.h"
@@ -47,7 +48,7 @@
#include "profile_data_collector.h"
#include "request_rate_manager.h"

namespace triton { namespace perfanalyzer {
namespace triton::perfanalyzer {

#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
@@ -443,6 +444,7 @@ class InferenceProfiler {
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable);


/// A helper function for profiling functions.
/// \param status_summary Returns the summary of the measurement.
/// \param request_count The number of requests to generate when profiling. If
@@ -829,4 +831,4 @@ class InferenceProfiler {
#endif
};

}} // namespace triton::perfanalyzer
} // namespace triton::perfanalyzer
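Both inference_profiler files migrate to the C++17 nested namespace definition, which is why the closing braces change from "}}" to "}"; the two spellings are equivalent:

// Pre-C++17: two nested namespace blocks, closed with "}}".
namespace triton { namespace perfanalyzer {
}}  // namespace triton::perfanalyzer

// C++17 nested namespace definition, closed with a single "}".
namespace triton::perfanalyzer {
}  // namespace triton::perfanalyzer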
[Diffs for the remaining 3 of the 10 changed files were not loaded.]
