From c7be29169ae54506f1b22f75252b526179f732a6 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 9 May 2024 02:21:53 +0000 Subject: [PATCH] [L0 v2] add latency tracker based on hdr_histogram This tracker allows for tracking min,max,mean,stdev and arbitrary percentile values. Calling TRACK_SCOPE_LATENCY(name) registers a latency tracker for a given scope. All latency measurements are collected to a per-thread histogram instance. When the program exits, all per-thread histograms (for the same scope) are agregated into a single histogram and all statistics are printed. --- .github/workflows/cmake.yml | 7 + CMakeLists.txt | 1 + .../level_zero/v2/command_list_cache.cpp | 5 + .../level_zero/v2/command_list_cache.hpp | 1 + .../level_zero/v2/event_provider_normal.cpp | 5 + source/common/CMakeLists.txt | 19 ++ source/common/latency_tracker.hpp | 226 ++++++++++++++++++ 7 files changed, 264 insertions(+) create mode 100644 source/common/latency_tracker.hpp diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 3b5ef70e19..ed0881e473 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -20,6 +20,7 @@ jobs: compiler: [{c: gcc, cxx: g++}] libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF'] pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF'] + latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF'] include: - os: 'ubuntu-22.04' build_type: Release @@ -36,6 +37,10 @@ jobs: - os: 'ubuntu-20.04' build_type: Release compiler: {c: gcc-7, cxx: g++-7} + - os: 'ubuntu-22.04' + build_type: Release + compiler: {c: clang, cxx: clang++} + latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON' runs-on: ${{matrix.os}} @@ -90,6 +95,7 @@ jobs: -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ ${{matrix.libbacktrace}} ${{matrix.pool_tracking}} + ${{matrix.latency_tracking}} - name: Configure CMake if: matrix.os == 'ubuntu-20.04' @@ -104,6 +110,7 @@ jobs: -DUR_FORMAT_CPP_STYLE=ON ${{matrix.libbacktrace}} ${{matrix.pool_tracking}} + ${{matrix.latency_tracking}} - name: Generate source from spec, check for uncommitted diff if: matrix.os == 'ubuntu-22.04' diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fd39ea676..f662527a0f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF) option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON) option(UR_STATIC_LOADER "Build loader as a static library" OFF) +option(UR_ENABLE_LATENCY_HISTOGRAM "Enable latncy histogram" OFF) set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable") set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs") set(UR_SYCL_LIBRARY_DIR "" CACHE PATH diff --git a/source/adapters/level_zero/v2/command_list_cache.cpp b/source/adapters/level_zero/v2/command_list_cache.cpp index c240cc8ee7..0684885391 100644 --- a/source/adapters/level_zero/v2/command_list_cache.cpp +++ b/source/adapters/level_zero/v2/command_list_cache.cpp @@ -81,6 +81,8 @@ command_list_cache_t::getImmediateCommandList( ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal, ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority, std::optional Index) { + TRACK_SCOPE_LATENCY("command_list_cache_t::getImmediateCommandList"); + immediate_command_list_descriptor_t Desc; Desc.ZeDevice = ZeDevice; Desc.Ordinal = Ordinal; @@ -100,12 +102,15 @@ command_list_cache_t::getImmediateCommandList( raii::cache_borrowed_command_list_t command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal) { + TRACK_SCOPE_LATENCY("command_list_cache_t::getRegularCommandList"); + regular_command_list_descriptor_t Desc; Desc.ZeDevice = ZeDevice; Desc.IsInOrder = IsInOrder; Desc.Ordinal = Ordinal; auto CommandList = getCommandList(Desc).release(); + return raii::cache_borrowed_command_list_t( CommandList, [Cache = this, Desc](ze_command_list_handle_t CmdList) { Cache->addCommandList( diff --git a/source/adapters/level_zero/v2/command_list_cache.hpp b/source/adapters/level_zero/v2/command_list_cache.hpp index 7cacf40604..1850a4334c 100644 --- a/source/adapters/level_zero/v2/command_list_cache.hpp +++ b/source/adapters/level_zero/v2/command_list_cache.hpp @@ -12,6 +12,7 @@ #include #include +#include "latency_tracker.hpp" #include #include #include diff --git a/source/adapters/level_zero/v2/event_provider_normal.cpp b/source/adapters/level_zero/v2/event_provider_normal.cpp index 4e8287b36c..9299356a15 100644 --- a/source/adapters/level_zero/v2/event_provider_normal.cpp +++ b/source/adapters/level_zero/v2/event_provider_normal.cpp @@ -11,6 +11,7 @@ #include "../common.hpp" #include "../context.hpp" #include "event_provider.hpp" +#include "latency_tracker.hpp" #include "ur_api.h" #include "ze_api.h" #include @@ -85,7 +86,10 @@ std::unique_ptr provider_normal::createProviderPool() { } event_allocation provider_normal::allocate() { + TRACK_SCOPE_LATENCY("provider_normal::allocate"); + if (pools.empty()) { + TRACK_SCOPE_LATENCY("provider_normal::allocate#createProviderPool"); pools.emplace_back(createProviderPool()); } @@ -97,6 +101,7 @@ event_allocation provider_normal::allocate() { } } + TRACK_SCOPE_LATENCY("provider_normal::allocate#slowPath"); std::sort(pools.begin(), pools.end(), [](auto &a, auto &b) { return a->nfree() < b->nfree(); // asceding }); diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 222b7f008b..d9fd7f8afc 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -6,6 +6,7 @@ add_ur_library(ur_common STATIC ur_util.cpp ur_util.hpp + latency_tracker.hpp $<$:windows/ur_lib_loader.cpp> $<$:linux/ur_lib_loader.cpp> ) @@ -44,6 +45,24 @@ set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool") FetchContent_MakeAvailable(unified-memory-framework) FetchContent_GetProperties(unified-memory-framework) +if(UR_ENABLE_LATENCY_HISTOGRAM) + set(HDR_HISTOGRAM_BUILD_STATIC CACHE INTERNAL ON "") + set(HDR_HISTOGRAM_BUILD_SHARED CACHE INTERNAL OFF "") + + include(FetchContent) + FetchContent_Declare(hdr_histogram + GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git + GIT_TAG 0.11.8 + ) + + FetchContent_MakeAvailable(hdr_histogram) + FetchContent_GetProperties(hdr_histogram) + + target_link_libraries(ur_common PUBLIC hdr_histogram_static) + target_include_directories(ur_common PUBLIC $) + target_compile_options(ur_common PUBLIC -DUR_ENABLE_LATENCY_HISTOGRAM=1) +endif() + target_link_libraries(ur_common PUBLIC ${CMAKE_DL_LIBS} ${PROJECT_NAME}::headers diff --git a/source/common/latency_tracker.hpp b/source/common/latency_tracker.hpp new file mode 100644 index 0000000000..03ed6b6443 --- /dev/null +++ b/source/common/latency_tracker.hpp @@ -0,0 +1,226 @@ +//===--------- latency_tracker.cpp - common ------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "logger/ur_logger.hpp" + +#if defined(UR_ENABLE_LATENCY_HISTOGRAM) + +#include + +static inline bool trackLatency = []() { + try { + auto map = getenv_to_map("UR_LOG_LATENCY"); + + if (!map.has_value()) { + return false; + } + + auto it = map->find("level"); + return it != map->end() && + logger::str_to_level(it->second.front()) != logger::Level::QUIET; + } catch (...) { + return false; + } +}(); + +static constexpr size_t numPercentiles = 7; +static constexpr double percentiles[numPercentiles] = { + 50.0, 90.0, 99.0, 99.9, 99.99, 99.999, 99.9999}; + +struct latencyValues { + int64_t min; + int64_t max; + int64_t mean; + int64_t stddev; + int64_t percentileValues[numPercentiles]; +}; + +using histogram_ptr = + std::unique_ptr; + +static inline latencyValues getValues(const struct hdr_histogram *histogram) { + latencyValues values; + values.max = hdr_max(histogram); + values.min = hdr_min(histogram); + values.mean = static_cast(hdr_mean(histogram)); + values.stddev = static_cast(hdr_stddev(histogram)); + + auto ret = hdr_value_at_percentiles( + histogram, percentiles, values.percentileValues, numPercentiles); + if (ret != 0) { + logger::error("Failed to get percentiles from latency histogram"); + } + + return values; +} + +class latency_printer { + public: + latency_printer() : logger(logger::create_logger("latency", true, false)) {} + + inline void publishLatency(const std::string &name, + histogram_ptr histogram) { + auto [it, inserted] = values.try_emplace(name, std::move(histogram)); + if (!inserted) { + // combine histograms + hdr_add(it->second.get(), histogram.get()); + } + } + + inline ~latency_printer() { + if (trackLatency) { + print(); + } + } + + inline void print() { + printHeader(); + + for (auto &[name, histogram] : values) { + auto value = getValues(histogram.get()); + logger.log(logger::Level::INFO, + "{},{},{},{},{},{},{},{},{},{},{},{},ns", name, + value.min, value.max, value.mean, value.stddev, + value.percentileValues[0], value.percentileValues[1], + value.percentileValues[2], value.percentileValues[3], + value.percentileValues[4], value.percentileValues[5], + value.percentileValues[6]); + } + } + + private: + inline void printHeader() { + logger.log(logger::Level::INFO, "Latency histogram:"); + logger.log(logger::Level::INFO, + "name,min,max,mean,stdev,p{},p{},p{},p{},p{},p{},p{},unit", + percentiles[0], percentiles[1], percentiles[2], + percentiles[3], percentiles[4], percentiles[5], + percentiles[6]); + } + + std::map values; + logger::Logger logger; +}; + +inline latency_printer &globalLatencyPrinter() { + static latency_printer printer; + return printer; +} + +class latency_histogram { + public: + inline latency_histogram(const char *name, + latency_printer &printer = globalLatencyPrinter(), + int64_t lowestDiscernibleValue = 1, + int64_t highestTrackableValue = 100'000'000'000, + int significantFigures = 3) + : name(name), histogram(nullptr, nullptr), printer(printer) { + if (trackLatency) { + struct hdr_histogram *cHistogram; + auto ret = hdr_init(lowestDiscernibleValue, highestTrackableValue, + significantFigures, &cHistogram); + if (ret != 0) { + logger::error("Failed to initialize latency histogram"); + } + histogram = + std::unique_ptr( + cHistogram, &hdr_close); + } + } + + latency_histogram(const latency_histogram &) = delete; + latency_histogram(latency_histogram &&) = delete; + + inline ~latency_histogram() { + if (!trackLatency || !histogram) { + return; + } + + if (hdr_min(histogram.get()) == std::numeric_limits::max()) { + logger::info("[{}] latency: no data", name); + return; + } + + printer.publishLatency(name, std::move(histogram)); + } + + inline void trackValue(int64_t value) { + hdr_record_value(histogram.get(), value); + } + + private: + const char *name; + histogram_ptr histogram; + latency_printer &printer; +}; + +class latency_tracker { + public: + inline explicit latency_tracker(latency_histogram &stats) + : stats(trackLatency ? &stats : nullptr), begin() { + if (trackLatency) { + begin = std::chrono::steady_clock::now(); + } + } + inline latency_tracker() {} + inline ~latency_tracker() { + if (stats) { + auto tp = std::chrono::steady_clock::now(); + auto diffNanos = + std::chrono::duration_cast(tp - begin) + .count(); + stats->trackValue(static_cast(diffNanos)); + } + } + + latency_tracker(const latency_tracker &) = delete; + latency_tracker &operator=(const latency_tracker &) = delete; + + inline latency_tracker(latency_tracker &&rhs) noexcept + : stats(rhs.stats), begin(rhs.begin) { + rhs.stats = nullptr; + } + + inline latency_tracker &operator=(latency_tracker &&rhs) noexcept { + if (this != &rhs) { + this->~latency_tracker(); + new (this) latency_tracker(std::move(rhs)); + } + return *this; + } + + private: + latency_histogram *stats{nullptr}; + std::chrono::time_point begin; +}; + +// To resolve __COUNTER__ +#define CONCAT(a, b) a##b + +// Each tracker has it's own thread-local histogram. +// At program exit, all histograms for the same scope are +// aggregated. +#define TRACK_SCOPE_LATENCY_CNT(name, cnt) \ + static thread_local latency_histogram CONCAT(histogram, cnt)(name); \ + latency_tracker CONCAT(tracker, cnt)(CONCAT(histogram, cnt)); +#define TRACK_SCOPE_LATENCY(name) TRACK_SCOPE_LATENCY_CNT(name, __COUNTER__) + +#else // UR_ENABLE_LATENCY_HISTOGRAM + +#define TRACK_SCOPE_LATENCY(name) + +#endif // UR_ENABLE_LATENCY_HISTOGRAM