From d4051c81eed16830331fb831e36ec5b433064963 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 25 Sep 2024 14:25:13 -0700 Subject: [PATCH] profiler: initial support for profiling graph ops --- ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 8 +- ggml/src/ggml-cpu/ggml-cpu.c | 16 ++++ ggml/src/ggml-impl.h | 4 + ggml/src/ggml-profile.cpp | 177 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-profile.h | 90 ++++++++++++++++++ ggml/src/ggml.c | 3 + 7 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-profile.cpp create mode 100644 ggml/src/ggml-profile.h diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index e33d974827cbe..df356f0209735 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -83,6 +83,7 @@ option(GGML_CCACHE "ggml: use ccache if available" ON) option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON) option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF) option(GGML_GPROF "ggml: enable gprof" OFF) +option(GGML_GRAPH_PROFILER "ggml: enable internal Graph and Op profiler" OFF) # build option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index a5f7f7b5b85a6..e5ede94665574 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -7,6 +7,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) endif() +if (GGML_GRAPH_PROFILER) + add_compile_definitions(GGML_GRAPH_PROFILER) +endif() + if (NOT MSVC) if (GGML_SANITIZE_THREAD) add_compile_options(-fsanitize=thread) @@ -215,7 +219,9 @@ add_library(ggml-base ggml-threading.cpp ggml-threading.h ggml-quants.c - ggml-quants.h) + ggml-quants.h + ggml-profile.h + ggml-profile.cpp) target_include_directories(ggml-base PRIVATE .) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b7fefb9ddfd89..be25ff00d78cd 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -10,6 +10,7 @@ #include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" +#include "ggml-profile.h" #include "amx/amx.h" #include "ggml.h" @@ -13585,6 +13586,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; + ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith); + ggml_compute_forward(¶ms, node); if (state->ith == 0 && cplan->abort_callback && @@ -13593,6 +13596,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { tp->ec = GGML_STATUS_ABORTED; } + ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith); + + ggml_barrier(state->threadpool); + + ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, state->ith); + } + + if (ggml_graph_profile_enabled(cgraph)) { + // need another barrier to flush the last timing update ggml_barrier(state->threadpool); } @@ -13827,6 +13839,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl int n_threads = cplan->n_threads; struct ggml_threadpool * threadpool = cplan->threadpool; + ggml_graph_profile_start(cgraph, n_threads); + bool disposable_threadpool = false; if (threadpool == NULL) { @@ -13878,6 +13892,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // don't leave affinity set on the main thread clear_numa_thread_affinity(); + ggml_graph_profile_finish(cgraph, n_threads); + enum ggml_status ret = threadpool->ec; if (disposable_threadpool) { diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 549772c57c90a..8dee7f183a96f 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -282,6 +282,8 @@ enum ggml_cgraph_eval_order { GGML_CGRAPH_EVAL_ORDER_COUNT }; +struct ggml_profile_data; + struct ggml_cgraph { int size; // maximum number of nodes/leafs/grads/grad_accs int n_nodes; // number of nodes currently in use @@ -292,6 +294,8 @@ struct ggml_cgraph { struct ggml_tensor ** grad_accs; // accumulators for node gradients struct ggml_tensor ** leafs; // tensors with constant data + struct ggml_profile_data * prof; + struct ggml_hash_set visited_hash_set; enum ggml_cgraph_eval_order order; diff --git a/ggml/src/ggml-profile.cpp b/ggml/src/ggml-profile.cpp new file mode 100644 index 0000000000000..bdafcde1f5313 --- /dev/null +++ b/ggml/src/ggml-profile.cpp @@ -0,0 +1,177 @@ +#include "ggml-profile.h" + +#include +#include +#include +#include + +#include +#include + +#ifdef GGML_GRAPH_PROFILER + +struct ggml_profile_output { + const char * prefix; + FILE * stream; +}; + +extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads) +{ + // TODO: make this a param + const char *env = getenv("GGML_GRAPH_PROFILE"); + if (!env) { return; } + + // The number of threads may change between passes (pp vs tg). + // Allocate for max_n_threads for simplicity for now. + // TODO: use aligned allocator + + size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS; + size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes; + size_t time_size = node_size * cg->n_nodes; + size_t t_size = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data); + + uint8_t * ptr = (uint8_t *) malloc(t_size); + if (!ptr) { + fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes); + return; + } + memset(ptr, 0, t_size); + + // init all pointers + cg->prof = (ggml_profile_data *) ptr; ptr += sizeof(ggml_profile_data); + cg->prof->output = (ggml_profile_output *) ptr; ptr += sizeof(ggml_profile_output); + cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size; + for (int i=0; i < cg->n_nodes; i++) { + cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size; + } + + // init the output + ggml_profile_output *out = cg->prof->output; + if (!strcmp("stderr", env) || !strcmp("1", env)) { + out->prefix = "ggml-profile:"; + out->stream = stderr; + } else { + out->prefix = ""; + out->stream = fopen(env, "w"); + } + +} + +extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads) +{ + if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); } + if (!cg->prof) { return; } +} + +static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t) +{ + return sprintf(str, "%d:%d:%d:%d", + (int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]); +} + +static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t) +{ + char *p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += ggml_profile_format_tensor_dims(p, t->src[0]); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += ggml_profile_format_tensor_dims(p, t->src[i]); + } + + p += sprintf(p, " -> "); + } + + // format self dims separately for better visual alignment + char self[64]; + ggml_profile_format_tensor_dims(self, t); + + p += sprintf(p, "%12s", self); +} + +static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t) +{ + char *p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", ggml_type_name(t->src[0]->type)); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", ggml_type_name(t->src[i]->type)); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%3s", ggml_type_name(t->type)); +} + +extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads) +{ + if (!cg->prof) { return; } + + ggml_profile_output *out = cg->prof->output; + + fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix); + fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix); + + char dims[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + + for (int i = 0; i < cg->n_nodes; i++) { + uint64_t p_nsec = 0; + uint64_t s_nsec = 0; + uint64_t t_nsec = 0; + + // add up per thread counters and reset them + for (int t=0; t < n_threads; t++) { + ggml_profile_timing &timing = cg->prof->timing[i][t]; + + p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START]; + s_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_SYNC]; + t_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_START]; + + timing.nsec[GGML_PROF_OP_START] = 0; + timing.nsec[GGML_PROF_OP_SYNC] = 0; + timing.nsec[GGML_PROF_OP_END] = 0; + } + + ggml_profile_format_op_dims(dims, cg->nodes[i]); + ggml_profile_format_op_types(types, cg->nodes[i]); + + fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix, + i, ggml_op_name(cg->nodes[i]->op), + (unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec, + dims, types, cg->nodes[i]->name); + } + fprintf(out->stream, "%s \n", out->prefix); // empty line to split tables +} + +extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg) +{ + if (!cg->prof) { return; } + + ggml_profile_output *out = cg->prof->output; + if (out->stream != stderr) { + fclose(out->stream); + } + + free(cg->prof); cg->prof = nullptr; +} + +extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith) +{ + if (!cg->prof) { return; } + + using clock = std::chrono::high_resolution_clock; + + ggml_profile_timing &timing = cg->prof->timing[node_n][ith]; + timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); +} + +#endif // GGML_GRAPH_PROFILER diff --git a/ggml/src/ggml-profile.h b/ggml/src/ggml-profile.h new file mode 100644 index 0000000000000..3f8fecc08b101 --- /dev/null +++ b/ggml/src/ggml-profile.h @@ -0,0 +1,90 @@ +#pragma once + +#include "ggml-impl.h" + +// GGML internal header + +#ifdef __cplusplus +extern "C" { +#endif + +// op profile events & timing (per op / per thread) +enum ggml_profile_event { + GGML_PROF_OP_START, + GGML_PROF_OP_SYNC, + GGML_PROF_OP_END +}; + +struct ggml_profile_timing { + uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec +}; + +struct ggml_profile_output; + +struct ggml_profile_data { + struct ggml_profile_output *output; + struct ggml_profile_timing ** timing; // per op / per thread timing +}; + +// check if profiling is enabled for this graph +static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg) +{ + return cg->prof != NULL; +} + +// get pointer to the timing data for specific node / thread +// can be used by the backends to populate data collected internally +static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith) +{ + if (!cg->prof) { return NULL; } + return &cg->prof->timing[node_n][ith]; +} + +#ifndef GGML_GRAPH_PROFILER + +// Stub out all profiler functions + +static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads) +{ + GGML_UNUSED(cg); + GGML_UNUSED(n_threads); +} + +static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads) +{ + GGML_UNUSED(cg); + GGML_UNUSED(n_threads); +} + +static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads) +{ + GGML_UNUSED(cg); + GGML_UNUSED(n_threads); +} + +static inline void ggml_graph_profile_free(struct ggml_cgraph *cg) +{ + GGML_UNUSED(cg); +} + +static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith) +{ + GGML_UNUSED(cg); + GGML_UNUSED(e); + GGML_UNUSED(node_n); + GGML_UNUSED(ith); +} + +#else + +void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads); +void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads); +void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads); +void ggml_graph_profile_free(struct ggml_cgraph *cg); +void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith); + +#endif // GGML_GRAPH_PROFILER + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2bbe5f48257b2..bf15e48daa555 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -8,6 +8,7 @@ // FIXME: required here for quantization functions #include "ggml-quants.h" +#include "ggml-profile.h" #ifdef GGML_USE_CPU_HBM #include @@ -5884,6 +5885,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.grads =*/ grads_ptr, /*.grad_accs =*/ grad_accs_ptr, /*.leafs =*/ leafs_ptr, + /*.prof =*/ NULL, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, }; @@ -5910,6 +5912,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.grads =*/ NULL, // gradients would need visited_hash_set /*.grad_accs =*/ NULL, /*.leafs =*/ NULL, + /*.prof =*/ NULL, /*.visited_hash_set =*/ { 0, NULL, NULL }, /*.order =*/ cgraph0->order, };