Skip to content

Commit

Permalink
[common] add latency tracker based on hdr_histogram
Browse files Browse the repository at this point in the history
This tracker allows for tracking min,max,mean,stdev and arbitrary percentile values.

Calling TRACK_SCOPE_LATENCY(name) registers a latency tracker for a given scope.
All latency measurements are collected to a per-thread histogram instance.
When the program exits, all per-thread histograms (for the same scope) are
agregated into a single histogram and all statistics are printed.
  • Loading branch information
igchor committed Aug 5, 2024
1 parent 84f5e70 commit b93ecbb
Show file tree
Hide file tree
Showing 7 changed files with 264 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ jobs:
compiler: [{c: gcc, cxx: g++}]
libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF']
pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF']
latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF']
include:
- os: 'ubuntu-22.04'
build_type: Release
Expand All @@ -36,6 +37,10 @@ jobs:
- os: 'ubuntu-20.04'
build_type: Release
compiler: {c: gcc-7, cxx: g++-7}
- os: 'ubuntu-22.04'
build_type: Release
compiler: {c: clang, cxx: clang++}
latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON'
runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }}

steps:
Expand Down Expand Up @@ -92,6 +97,7 @@ jobs:
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
${{matrix.libbacktrace}}
${{matrix.pool_tracking}}
${{matrix.latency_tracking}}
- name: Configure CMake
if: matrix.os == 'ubuntu-20.04'
Expand All @@ -106,6 +112,7 @@ jobs:
-DUR_FORMAT_CPP_STYLE=ON
${{matrix.libbacktrace}}
${{matrix.pool_tracking}}
${{matrix.latency_tracking}}
- name: Generate source from spec, check for uncommitted diff
if: matrix.os == 'ubuntu-22.04'
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON)
option(UR_STATIC_LOADER "Build loader as a static library" OFF)
option(UR_FORCE_LIBSTDCXX "Force use of libstdc++ in a build using libc++ on Linux" OFF)
option(UR_ENABLE_LATENCY_HISTOGRAM "Enable latncy histogram" OFF)
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs")
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
Expand Down
5 changes: 5 additions & 0 deletions source/adapters/level_zero/v2/command_list_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ command_list_cache_t::getImmediateCommandList(
ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal,
ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority,
std::optional<uint32_t> Index) {
TRACK_SCOPE_LATENCY("command_list_cache_t::getImmediateCommandList");

immediate_command_list_descriptor_t Desc;
Desc.ZeDevice = ZeDevice;
Desc.Ordinal = Ordinal;
Expand All @@ -100,12 +102,15 @@ command_list_cache_t::getImmediateCommandList(
raii::cache_borrowed_command_list_t
command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice,
bool IsInOrder, uint32_t Ordinal) {
TRACK_SCOPE_LATENCY("command_list_cache_t::getRegularCommandList");

regular_command_list_descriptor_t Desc;
Desc.ZeDevice = ZeDevice;
Desc.IsInOrder = IsInOrder;
Desc.Ordinal = Ordinal;

auto CommandList = getCommandList(Desc).release();

return raii::cache_borrowed_command_list_t(
CommandList, [Cache = this, Desc](ze_command_list_handle_t CmdList) {
Cache->addCommandList(
Expand Down
1 change: 1 addition & 0 deletions source/adapters/level_zero/v2/command_list_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <functional>
#include <stack>

#include "latency_tracker.hpp"
#include <ur/ur.hpp>
#include <ur_api.h>
#include <ze_api.h>
Expand Down
5 changes: 5 additions & 0 deletions source/adapters/level_zero/v2/event_provider_normal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "../common.hpp"
#include "../context.hpp"
#include "event_provider.hpp"
#include "latency_tracker.hpp"
#include "ur_api.h"
#include "ze_api.h"
#include <memory>
Expand Down Expand Up @@ -85,7 +86,10 @@ std::unique_ptr<provider_pool> provider_normal::createProviderPool() {
}

event_allocation provider_normal::allocate() {
TRACK_SCOPE_LATENCY("provider_normal::allocate");

if (pools.empty()) {
TRACK_SCOPE_LATENCY("provider_normal::allocate#createProviderPool");
pools.emplace_back(createProviderPool());
}

Expand All @@ -97,6 +101,7 @@ event_allocation provider_normal::allocate() {
}
}

TRACK_SCOPE_LATENCY("provider_normal::allocate#slowPath");
std::sort(pools.begin(), pools.end(), [](auto &a, auto &b) {
return a->nfree() < b->nfree(); // asceding
});
Expand Down
19 changes: 19 additions & 0 deletions source/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
add_ur_library(ur_common STATIC
ur_util.cpp
ur_util.hpp
latency_tracker.hpp
$<$<PLATFORM_ID:Windows>:windows/ur_lib_loader.cpp>
$<$<PLATFORM_ID:Linux,Darwin>:linux/ur_lib_loader.cpp>
)
Expand Down Expand Up @@ -45,6 +46,24 @@ set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool")
FetchContent_MakeAvailable(unified-memory-framework)
FetchContent_GetProperties(unified-memory-framework)

if(UR_ENABLE_LATENCY_HISTOGRAM)
set(HDR_HISTOGRAM_BUILD_STATIC CACHE INTERNAL ON "")
set(HDR_HISTOGRAM_BUILD_SHARED CACHE INTERNAL OFF "")

include(FetchContent)
FetchContent_Declare(hdr_histogram
GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git
GIT_TAG 0.11.8
)

FetchContent_MakeAvailable(hdr_histogram)
FetchContent_GetProperties(hdr_histogram)

target_link_libraries(ur_common PUBLIC hdr_histogram_static)
target_include_directories(ur_common PUBLIC $<BUILD_INTERFACE:${hdr_histogram_SOURCE_DIR}/include>)
target_compile_options(ur_common PUBLIC -DUR_ENABLE_LATENCY_HISTOGRAM=1)
endif()

target_link_libraries(ur_common PUBLIC
${CMAKE_DL_LIBS}
${PROJECT_NAME}::headers
Expand Down
226 changes: 226 additions & 0 deletions source/common/latency_tracker.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
//===--------- latency_tracker.cpp - common ------------------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <chrono>
#include <cstdint>
#include <limits>
#include <memory>

#include "logger/ur_logger.hpp"

#if defined(UR_ENABLE_LATENCY_HISTOGRAM)

#include <hdr/hdr_histogram.h>

static inline bool trackLatency = []() {
try {
auto map = getenv_to_map("UR_LOG_LATENCY");

if (!map.has_value()) {
return false;
}

auto it = map->find("level");
return it != map->end() &&
logger::str_to_level(it->second.front()) != logger::Level::QUIET;
} catch (...) {
return false;
}
}();

static constexpr size_t numPercentiles = 7;
static constexpr double percentiles[numPercentiles] = {
50.0, 90.0, 99.0, 99.9, 99.99, 99.999, 99.9999};

struct latencyValues {
int64_t min;
int64_t max;
int64_t mean;
int64_t stddev;
int64_t percentileValues[numPercentiles];
};

using histogram_ptr =
std::unique_ptr<struct hdr_histogram, decltype(&hdr_close)>;

static inline latencyValues getValues(const struct hdr_histogram *histogram) {
latencyValues values;
values.max = hdr_max(histogram);
values.min = hdr_min(histogram);
values.mean = static_cast<int64_t>(hdr_mean(histogram));
values.stddev = static_cast<int64_t>(hdr_stddev(histogram));

auto ret = hdr_value_at_percentiles(
histogram, percentiles, values.percentileValues, numPercentiles);
if (ret != 0) {
logger::error("Failed to get percentiles from latency histogram");
}

return values;
}

class latency_printer {
public:
latency_printer() : logger(logger::create_logger("latency", true, false)) {}

inline void publishLatency(const std::string &name,
histogram_ptr histogram) {
auto [it, inserted] = values.try_emplace(name, std::move(histogram));
if (!inserted) {
// combine histograms
hdr_add(it->second.get(), histogram.get());
}
}

inline ~latency_printer() {
if (trackLatency) {
print();
}
}

inline void print() {
printHeader();

for (auto &[name, histogram] : values) {
auto value = getValues(histogram.get());
logger.log(logger::Level::INFO,
"{},{},{},{},{},{},{},{},{},{},{},{},ns", name,
value.min, value.max, value.mean, value.stddev,
value.percentileValues[0], value.percentileValues[1],
value.percentileValues[2], value.percentileValues[3],
value.percentileValues[4], value.percentileValues[5],
value.percentileValues[6]);
}
}

private:
inline void printHeader() {
logger.log(logger::Level::INFO, "Latency histogram:");
logger.log(logger::Level::INFO,
"name,min,max,mean,stdev,p{},p{},p{},p{},p{},p{},p{},unit",
percentiles[0], percentiles[1], percentiles[2],
percentiles[3], percentiles[4], percentiles[5],
percentiles[6]);
}

std::map<std::string, histogram_ptr> values;
logger::Logger logger;
};

inline latency_printer &globalLatencyPrinter() {
static latency_printer printer;
return printer;
}

class latency_histogram {
public:
inline latency_histogram(const char *name,
latency_printer &printer = globalLatencyPrinter(),
int64_t lowestDiscernibleValue = 1,
int64_t highestTrackableValue = 100'000'000'000,
int significantFigures = 3)
: name(name), histogram(nullptr, nullptr), printer(printer) {
if (trackLatency) {
struct hdr_histogram *cHistogram;
auto ret = hdr_init(lowestDiscernibleValue, highestTrackableValue,
significantFigures, &cHistogram);
if (ret != 0) {
logger::error("Failed to initialize latency histogram");
}
histogram =
std::unique_ptr<struct hdr_histogram, decltype(&hdr_close)>(
cHistogram, &hdr_close);
}
}

latency_histogram(const latency_histogram &) = delete;
latency_histogram(latency_histogram &&) = delete;

inline ~latency_histogram() {
if (!trackLatency || !histogram) {
return;
}

if (hdr_min(histogram.get()) == std::numeric_limits<int64_t>::max()) {
logger::info("[{}] latency: no data", name);
return;
}

printer.publishLatency(name, std::move(histogram));
}

inline void trackValue(int64_t value) {
hdr_record_value(histogram.get(), value);
}

private:
const char *name;
histogram_ptr histogram;
latency_printer &printer;
};

class latency_tracker {
public:
inline explicit latency_tracker(latency_histogram &stats)
: stats(trackLatency ? &stats : nullptr), begin() {
if (trackLatency) {
begin = std::chrono::steady_clock::now();
}
}
inline latency_tracker() {}
inline ~latency_tracker() {
if (stats) {
auto tp = std::chrono::steady_clock::now();
auto diffNanos =
std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin)
.count();
stats->trackValue(static_cast<int64_t>(diffNanos));
}
}

latency_tracker(const latency_tracker &) = delete;
latency_tracker &operator=(const latency_tracker &) = delete;

inline latency_tracker(latency_tracker &&rhs) noexcept
: stats(rhs.stats), begin(rhs.begin) {
rhs.stats = nullptr;
}

inline latency_tracker &operator=(latency_tracker &&rhs) noexcept {
if (this != &rhs) {
this->~latency_tracker();
new (this) latency_tracker(std::move(rhs));
}
return *this;
}

private:
latency_histogram *stats{nullptr};
std::chrono::time_point<std::chrono::steady_clock> begin;
};

// To resolve __COUNTER__
#define CONCAT(a, b) a##b

// Each tracker has it's own thread-local histogram.
// At program exit, all histograms for the same scope are
// aggregated.
#define TRACK_SCOPE_LATENCY_CNT(name, cnt) \
static thread_local latency_histogram CONCAT(histogram, cnt)(name); \
latency_tracker CONCAT(tracker, cnt)(CONCAT(histogram, cnt));
#define TRACK_SCOPE_LATENCY(name) TRACK_SCOPE_LATENCY_CNT(name, __COUNTER__)

#else // UR_ENABLE_LATENCY_HISTOGRAM

#define TRACK_SCOPE_LATENCY(name)

#endif // UR_ENABLE_LATENCY_HISTOGRAM

0 comments on commit b93ecbb

Please sign in to comment.