Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PA: Fixed scheduler manager #88

Merged
merged 16 commits into from
Nov 7, 2024
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ set(
data_loader.cc
concurrency_manager.cc
request_rate_manager.cc
custom_request_schedule_manager.cc
load_worker.cc
concurrency_worker.cc
request_rate_worker.cc
Expand Down Expand Up @@ -160,6 +161,7 @@ set(
data_loader.h
concurrency_manager.h
request_rate_manager.h
custom_request_schedule_manager.h
custom_load_manager.h
iworker.h
load_worker.h
Expand Down
29 changes: 24 additions & 5 deletions src/command_line_parser.cc
matthewkotila marked this conversation as resolved.
Show resolved Hide resolved
matthewkotila marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ SplitString(const std::string& str, const std::string& delimiter = ":")
std::vector<std::string> substrs;
size_t pos = 0;
while (pos != std::string::npos) {
size_t colon_pos = str.find(":", pos);
size_t colon_pos = str.find(delimiter, pos);
substrs.push_back(str.substr(pos, colon_pos - pos));
if (colon_pos == std::string::npos) {
pos = colon_pos;
Expand Down Expand Up @@ -908,6 +908,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"endpoint", required_argument, 0, long_option_idx_base + 61},
{"request-count", required_argument, 0, long_option_idx_base + 62},
{"warmup-request-count", required_argument, 0, long_option_idx_base + 63},
{"schedule", required_argument, 0, long_option_idx_base + 64},
{0, 0, 0, 0}};

// Parse commandline...
Expand Down Expand Up @@ -1647,7 +1648,9 @@ CLParser::ParseCommandLine(int argc, char** argv)
if (std::stoi(optarg) < 0) {
Usage("Failed to parse --request-count. The value must be > 0.");
}
params_->request_count = std::stoi(optarg);
if (params_->request_count == 0) {
params_->request_count = std::stoi(optarg);
}
break;
}
case long_option_idx_base + 63: {
Expand All @@ -1659,6 +1662,17 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->warmup_request_count = std::stoi(optarg);
break;
}
case long_option_idx_base + 64: {
std::vector<float> schedule;
std::string arg = optarg;
std::vector<std::string> float_strings = SplitString(optarg, ",");
for (const std::string& str : float_strings) {
schedule.push_back(std::stof(str));
}
params_->schedule = schedule;
params_->request_count = schedule.size();
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
Expand Down Expand Up @@ -1977,9 +1991,13 @@ CLParser::VerifyOptions()
Usage(
"perf_analyzer supports only grpc protocol for TensorFlow Serving.");
} else if (params_->streaming) {
Usage("perf_analyzer does not support streaming for TensorFlow Serving.");
Usage(
"perf_analyzer does not support streaming for TensorFlow "
"Serving.");
} else if (params_->async) {
Usage("perf_analyzer does not support async API for TensorFlow Serving.");
Usage(
"perf_analyzer does not support async API for TensorFlow "
"Serving.");
} else if (!params_->using_batch_size) {
params_->batch_size = 0;
}
Expand Down Expand Up @@ -2008,7 +2026,8 @@ CLParser::VerifyOptions()
if (params_->async && params_->streaming &&
params_->shared_memory_type != SharedMemoryType::NO_SHARED_MEMORY) {
Usage(
"Cannot use --shared-memory=system or --shared-memory=cuda with "
"Cannot use --shared-memory=system or --shared-memory=cuda "
"with "
"--service-kind=triton_c_api and --async and --streaming.");
}

Expand Down
2 changes: 2 additions & 0 deletions src/command_line_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ struct PerfAnalyzerParameters {
Range<uint64_t> periodic_concurrency_range{1, 1, 1};
uint64_t request_period{10};
size_t warmup_request_count{0};

std::vector<float> schedule{};
};

using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
Expand Down
121 changes: 121 additions & 0 deletions src/custom_request_schedule_manager.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "custom_request_schedule_manager.h"

namespace triton::perfanalyzer {

cb::Error
CustomRequestScheduleManager::Create(
    const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
    const std::shared_ptr<cb::ClientBackendFactory>& factory,
    std::unique_ptr<LoadManager>* manager)
{
  // NOTE: std::make_unique cannot be used here because the constructor is
  // protected and make_unique constructs from outside class scope.
  manager->reset(new CustomRequestScheduleManager(params, parser, factory));
  return cb::Error::Success;
}

// Constructs the manager by forwarding the relevant fields of `params` to the
// RequestRateManager base. Distribution::CUSTOM selects the user-defined
// schedule path in the base class; the raw schedule timestamps are kept in
// schedule_ for later scaling in GenerateSchedule().
CustomRequestScheduleManager::CustomRequestScheduleManager(
    const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
    const std::shared_ptr<cb::ClientBackendFactory>& factory)
    : RequestRateManager(
          params->async, params->streaming, Distribution::CUSTOM,
          params->batch_size, params->measurement_window_ms, params->max_trials,
          params->max_threads, params->num_of_sequences,
          params->shared_memory_type, params->output_shm_size,
          params->serial_sequences, parser, factory,
          params->request_parameters),
      schedule_(params->schedule)
{
}

cb::Error
CustomRequestScheduleManager::PerformWarmup(
    double request_rate, size_t warmup_request_count)
{
  // A warmup count of zero means warmup is disabled; otherwise issue the
  // warmup requests at the given rate and wait for them to drain.
  if (warmup_request_count > 0) {
    RETURN_IF_ERROR(ChangeRequestRate(request_rate, warmup_request_count));
    WaitForWarmupAndCleanup();
  }
  return cb::Error::Success;
}

// Re-arms the workers with a schedule derived from the stored user schedule.
// \param request_rate The rate used to scale the stored schedule timestamps.
// \param request_count The number of requests to generate when profiling.
// \return cb::Error object indicating success or failure.
cb::Error
CustomRequestScheduleManager::ChangeRequestRate(
    const double request_rate, const size_t request_count)
{
  // Workers must be paused before their thread configuration and schedules
  // are swapped out; order of these calls matters.
  PauseWorkers();
  ConfigureThreads(request_count);
  GenerateSchedule(request_rate, schedule_);
  ResumeWorkers();

  return cb::Error::Success;
}

// Generates and distributes per-worker schedules from the user schedule.
// Each user-provided timestamp is scaled (divided) by `request_rate` before
// being handed to the workers.
// \param request_rate The request rate used to scale the schedule.
// \param schedule The vector containing the user-provided schedule timestamps.
void
CustomRequestScheduleManager::GenerateSchedule(
    const double request_rate, const std::vector<float>& schedule)
{
  std::vector<float> scaled_schedule;
  scaled_schedule.reserve(schedule.size());
  for (const auto& value : schedule) {
    scaled_schedule.push_back(value / static_cast<float>(request_rate));
  }
  // BUG FIX: previously the UNSCALED `schedule` was passed here, so the
  // scaled values computed above were silently discarded and request_rate
  // had no effect on the generated worker schedules.
  auto worker_schedules = CreateWorkerSchedules(scaled_schedule);
  GiveSchedulesToWorkers(worker_schedules);
}

std::vector<RateSchedulePtr_t>
CustomRequestScheduleManager::CreateWorkerSchedules(
const std::vector<float>& schedule)
{
std::vector<RateSchedulePtr_t> worker_schedules =
CreateEmptyWorkerSchedules();
std::vector<size_t> thread_ids{CalculateThreadIds()};
std::chrono::nanoseconds next_timestamp(0);
size_t thread_id_index = 0;
size_t worker_index = 0;

for (const float& val : schedule) {
next_timestamp = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::duration<float>(val));
worker_index = thread_ids[thread_id_index];
thread_id_index = ++thread_id_index % thread_ids.size();
worker_schedules[worker_index]->intervals.emplace_back(next_timestamp);
}
SetScheduleDurations(worker_schedules);

return worker_schedules;
}

} // namespace triton::perfanalyzer
109 changes: 109 additions & 0 deletions src/custom_request_schedule_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "command_line_parser.h"
#include "load_manager.h"
#include "request_rate_manager.h"

namespace triton::perfanalyzer {

//==============================================================================
/// CustomRequestScheduleManager is a helper class to send inference requests to
/// inference server in accordance with the schedule set by the user.
///
/// Detail:
/// An instance of this load manager will be created at the beginning of the
/// perf analyzer and it will be used to schedule to send requests at that
/// particular second defined by the user. The particular seconds at which a
/// request should be sent can be set by the user using the `schedule` option.
/// For example, if the `schedule` is set to 1,2,4,5,6.5,
/// CustomRequestScheduleManager sends request at 1st second, 2nd second, 4th
/// second and so on.
///

class CustomRequestScheduleManager : public RequestRateManager {
public:
~CustomRequestScheduleManager() = default;

/// Creates an object of CustomRequestScheduleManager
/// \param params A PAParamsPtr (std::shared_ptr<PerfAnalyzerParameters>) that
/// holds configuration parameters to create CustomRequestScheduleManager
/// object
///
static cb::Error Create(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay I... am conflicted here, but here's my feedback on what I see.

This sort of API is... not great. It's extremely error prone, due to the sheer number of unnamed parameters that are mixed between input and output. Yet, it matches the current existing API from RequestRateManager, and you're just following the existing pattern, so that's not really on you, and the rest of what I'm typing here isn't directed at you, just my general statement on the state of the codebase in general.

To explain the problem: what I see here is full of what's now considered bad programming practice, and we should start considering cleaning this code. I'm just not sure if we should start now with this PR, or have a more thorough clean later, as trying to do this now may (1) take a long time for you and (2) look akward if it's not propagated throughout the rest of the codebase. But also, working incrementally towards having a better codebase is a good path forward.

  • We need to embrace named argument paradigms. C++20 allows for better syntax, but even in normal C++03, we can do something along the lines of a struct which contains the arguments to pass over. This by far is the most problematic part of this call here. This function call has way too many arguments, and it's excessively error-prone. We should instead have structured arguments to then unpack and use later. If only one of the elements from my feedback is actioned upon, it should be this one.
  • Random bool values in the middle of a function call is also error prone. A bit less so if we have structured arguments. If we do not have structured argument for bools, we should generally speaking replace them with enums to be more expressive of what exactly we're doing instead of just having a string of true and false in the middle of a function call.
  • We'd need to use something like std::span instead of passing references to vectors. Unfortunately, this is also something which only exists in C++20. We could also use Abseil here, since we have it in the dependency tree anyway transitively from gRPC.
  • The usage of shared_ptr is usually a codesmell. I need to investigate more what's going on here, but I'd really like to fully ban shared_ptr altogether from the codebase if I could. Again, depending on what's exactly being done here, but it generally speaking is a telltale of ownership issues. The fact it's passing const references to shared pointers is even more indicative of an exising problem.
  • Passing a pointer in the middle of the list of arguments for the output value is also a red flag. We should refactor the return value system we have to have a "StatusOr" paradigm, which is basically an std::variant between error codes, and return types, allowing us to either return an error, or an actual value. Abseil also provides this, but I'd rather we come up with something which matches our usage case properly, as Abseil's version is way too opiniated for our usage.
  • The mix'n'match of signed (batch_size) and unsigned (max_threads and num_of_sequences) values for size types is odd. In general we should use unsigned for anything size-related.
  • Durations, such as this measurement_window_ms, should be better typed to incorporate the time unit, ideally using std::chrono.

Now, looking at it a bit more down the code, in perf_analyzer.cc, we have a structure already with all the parameters, called params_, which we unpack into all of these arguments... Instead, we could just easily pass that structure around, and let the callees decide what to pick from them. As in, my first point in this feedback is already done, basically, one would just need to haul the structure up and down instead of unpacking / repacking it with so many arguments.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To clarify a bit my stance: I am not against smart pointers in general, just that shared_ptr are usually unique_ptr in disguise. They are non-committal about the actual ownership of the object, whereas unique_ptr has a clearer definition of it.

Copy link
Contributor

@lkomali lkomali Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The shared_ptr is for client backend factory and I have no idea how much of an effort it is to modify everything to unique_ptr. I can create a ticket for that effort.
Coming to the constructor call, I agree with all your points. As far as I understand, your suggestion is to pass params object instead of the individual arguments. It makes sense to me. Again, I'm not sure if I'm going to break something by modifying the API. Let me know if I should add a ticket to investigate and modify the all the related APIs because as you said, it might look awkward if we modify for some of them and it's not propagated until some time after.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, let's just try and plumb the parameters object in this one instance, and keep in mind we should do this for newer code. If we have time, we should go back and clean older API incrementally. You're right: let's not break existing code right now, but let's avoid introducing new code which follows this older pattern.

For the unique_ptr vs shared_ptr, this was more of a general remark that we should keep in mind when adding new shared_ptr in the codebase. At some point we should review the existing usage and clean it up, but not in this one PR.

const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager);

/// Performs warmup for benchmarking by sending a fixed number of requests
/// according to the specified request rate
/// \param request_rate The rate at which requests must be issued to the
/// server \param warmup_request_count The number of warmup requests to send
/// \return cb::Error object indicating success or failure
cb::Error PerformWarmup(
double request_rate, size_t warmup_request_count) override;

/// Adjusts the rate of issuing requests to be the same as 'request_rate'
/// \param request_rate The rate at which requests must be issued to the
/// server \param request_count The number of requests to generate when
/// profiling \return cb::Error object indicating success or failure
cb::Error ChangeRequestRate(
const double request_rate, const size_t request_count) override;


protected:
/// Constructor for CustomRequestScheduleManager
///
/// Initializes a CustomRequestScheduleManager instance using a PAParamsPtr
/// object that contains all necessary parameters for request scheduling.
///
/// \param params A PAParamsPtr (std::shared_ptr<PerfAnalyzerParameters>) that
/// holds configuration parameters to create CustomRequestScheduleManager
/// object
///
CustomRequestScheduleManager(
const pa::PAParamsPtr& params, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);

/// Generates and updates the request schedule as per the given request rate
/// and schedule \param request_rate The request rate to use for new schedule
/// \param schedule The vector containing the schedule for requests
void GenerateSchedule(
const double request_rate, const std::vector<float>& schedule);

/// Creates worker schedules based on the provided schedule
/// \param duration The maximum duration for the schedule
/// \param schedule The vector containing the schedule for requests
/// \return A vector of RateSchedulePtr_t representing the worker schedules
std::vector<RateSchedulePtr_t> CreateWorkerSchedules(
const std::vector<float>& schedule);

/// The vector containing the schedule for requests
std::vector<float> schedule_;
};

} // namespace triton::perfanalyzer
5 changes: 3 additions & 2 deletions src/inference_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
#include "constants.h"
#include "doctest.h"

namespace triton { namespace perfanalyzer {
namespace triton::perfanalyzer {
cb::Error
ReportPrometheusMetrics(const Metrics& metrics)
{
Expand Down Expand Up @@ -622,6 +622,7 @@ InferenceProfiler::Profile(
is_stable = false;
meets_threshold = true;


RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
->PerformWarmup(request_rate, warmup_request_count));
RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
Expand Down Expand Up @@ -1872,4 +1873,4 @@ InferenceProfiler::MergeMetrics(
return cb::Error::Success;
}

}} // namespace triton::perfanalyzer
} // namespace triton::perfanalyzer
6 changes: 4 additions & 2 deletions src/inference_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "concurrency_manager.h"
#include "constants.h"
#include "custom_load_manager.h"
#include "custom_request_schedule_manager.h"
#include "metrics.h"
#include "metrics_manager.h"
#include "model_parser.h"
Expand All @@ -47,7 +48,7 @@
#include "profile_data_collector.h"
#include "request_rate_manager.h"

namespace triton { namespace perfanalyzer {
namespace triton::perfanalyzer {

#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
Expand Down Expand Up @@ -443,6 +444,7 @@ class InferenceProfiler {
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable);


/// A helper function for profiling functions.
/// \param status_summary Returns the summary of the measurement.
/// \param request_count The number of requests to generate when profiling. If
Expand Down Expand Up @@ -829,4 +831,4 @@ class InferenceProfiler {
#endif
};

}} // namespace triton::perfanalyzer
} // namespace triton::perfanalyzer
Loading
Loading