Skip to content

Commit

Permalink
profiler: initial support for profiling graph ops
Browse files Browse the repository at this point in the history
  • Loading branch information
max-krasnyansky committed Dec 24, 2024
1 parent 9ba399d commit d4051c8
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 1 deletion.
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ option(GGML_CCACHE "ggml: use ccache if available" ON)
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF "ggml: enable gprof" OFF)
option(GGML_GRAPH_PROFILER "ggml: enable internal Graph and Op profiler" OFF)

# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
Expand Down
8 changes: 7 additions & 1 deletion ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
endif()

if (GGML_GRAPH_PROFILER)
add_compile_definitions(GGML_GRAPH_PROFILER)
endif()

if (NOT MSVC)
if (GGML_SANITIZE_THREAD)
add_compile_options(-fsanitize=thread)
Expand Down Expand Up @@ -215,7 +219,9 @@ add_library(ggml-base
ggml-threading.cpp
ggml-threading.h
ggml-quants.c
ggml-quants.h)
ggml-quants.h
ggml-profile.h
ggml-profile.cpp)

target_include_directories(ggml-base PRIVATE .)

Expand Down
16 changes: 16 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
#include "ggml-profile.h"
#include "amx/amx.h"
#include "ggml.h"

Expand Down Expand Up @@ -13585,6 +13586,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);

ggml_compute_forward(&params, node);

if (state->ith == 0 && cplan->abort_callback &&
Expand All @@ -13593,6 +13596,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
tp->ec = GGML_STATUS_ABORTED;
}

ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);

ggml_barrier(state->threadpool);

ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
}

if (ggml_graph_profile_enabled(cgraph)) {
// need another barrier to flush the last timing update
ggml_barrier(state->threadpool);
}

Expand Down Expand Up @@ -13827,6 +13839,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
int n_threads = cplan->n_threads;
struct ggml_threadpool * threadpool = cplan->threadpool;

ggml_graph_profile_start(cgraph, n_threads);

bool disposable_threadpool = false;

if (threadpool == NULL) {
Expand Down Expand Up @@ -13878,6 +13892,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
// don't leave affinity set on the main thread
clear_numa_thread_affinity();

ggml_graph_profile_finish(cgraph, n_threads);

enum ggml_status ret = threadpool->ec;

if (disposable_threadpool) {
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@ enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_COUNT
};

struct ggml_profile_data;

struct ggml_cgraph {
int size; // maximum number of nodes/leafs/grads/grad_accs
int n_nodes; // number of nodes currently in use
Expand All @@ -292,6 +294,8 @@ struct ggml_cgraph {
struct ggml_tensor ** grad_accs; // accumulators for node gradients
struct ggml_tensor ** leafs; // tensors with constant data

struct ggml_profile_data * prof;

struct ggml_hash_set visited_hash_set;

enum ggml_cgraph_eval_order order;
Expand Down
177 changes: 177 additions & 0 deletions ggml/src/ggml-profile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#include "ggml-profile.h"

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <string>
#include <chrono>

#ifdef GGML_GRAPH_PROFILER

struct ggml_profile_output {
const char * prefix;
FILE * stream;
};

extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
{
// TODO: make this a param
const char *env = getenv("GGML_GRAPH_PROFILE");
if (!env) { return; }

// The number of threads may change between passes (pp vs tg).
// Allocate for max_n_threads for simplicity for now.
// TODO: use aligned allocator

size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
size_t time_size = node_size * cg->n_nodes;
size_t t_size = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);

uint8_t * ptr = (uint8_t *) malloc(t_size);
if (!ptr) {
fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
return;
}
memset(ptr, 0, t_size);

// init all pointers
cg->prof = (ggml_profile_data *) ptr; ptr += sizeof(ggml_profile_data);
cg->prof->output = (ggml_profile_output *) ptr; ptr += sizeof(ggml_profile_output);
cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
for (int i=0; i < cg->n_nodes; i++) {
cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
}

// init the output
ggml_profile_output *out = cg->prof->output;
if (!strcmp("stderr", env) || !strcmp("1", env)) {
out->prefix = "ggml-profile:";
out->stream = stderr;
} else {
out->prefix = "";
out->stream = fopen(env, "w");
}

}

extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
{
if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
if (!cg->prof) { return; }
}

static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
{
return sprintf(str, "%d:%d:%d:%d",
(int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]);
}

static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
{
char *p = str;

// append src0 and src1 (if any)
if (t->src[0]) {
p += ggml_profile_format_tensor_dims(p, t->src[0]);

for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += ggml_profile_format_tensor_dims(p, t->src[i]);
}

p += sprintf(p, " -> ");
}

// format self dims separately for better visual alignment
char self[64];
ggml_profile_format_tensor_dims(self, t);

p += sprintf(p, "%12s", self);
}

static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
{
char *p = str;

// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));

for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
}

p += sprintf(p, " -> ");
}

p += sprintf(p, "%3s", ggml_type_name(t->type));
}

extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
{
if (!cg->prof) { return; }

ggml_profile_output *out = cg->prof->output;

fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);

char dims[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];

for (int i = 0; i < cg->n_nodes; i++) {
uint64_t p_nsec = 0;
uint64_t s_nsec = 0;
uint64_t t_nsec = 0;

// add up per thread counters and reset them
for (int t=0; t < n_threads; t++) {
ggml_profile_timing &timing = cg->prof->timing[i][t];

p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
s_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_SYNC];
t_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_START];

timing.nsec[GGML_PROF_OP_START] = 0;
timing.nsec[GGML_PROF_OP_SYNC] = 0;
timing.nsec[GGML_PROF_OP_END] = 0;
}

ggml_profile_format_op_dims(dims, cg->nodes[i]);
ggml_profile_format_op_types(types, cg->nodes[i]);

fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
i, ggml_op_name(cg->nodes[i]->op),
(unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
dims, types, cg->nodes[i]->name);
}
fprintf(out->stream, "%s \n", out->prefix); // empty line to split tables
}

extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
{
if (!cg->prof) { return; }

ggml_profile_output *out = cg->prof->output;
if (out->stream != stderr) {
fclose(out->stream);
}

free(cg->prof); cg->prof = nullptr;
}

extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
{
if (!cg->prof) { return; }

using clock = std::chrono::high_resolution_clock;

ggml_profile_timing &timing = cg->prof->timing[node_n][ith];
timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}

#endif // GGML_GRAPH_PROFILER
90 changes: 90 additions & 0 deletions ggml/src/ggml-profile.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#pragma once

#include "ggml-impl.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#endif

// op profile events & timing (per op / per thread)
enum ggml_profile_event {
GGML_PROF_OP_START,
GGML_PROF_OP_SYNC,
GGML_PROF_OP_END
};

struct ggml_profile_timing {
uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
};

struct ggml_profile_output;

struct ggml_profile_data {
struct ggml_profile_output *output;
struct ggml_profile_timing ** timing; // per op / per thread timing
};

// check if profiling is enabled for this graph
static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
{
return cg->prof != NULL;
}

// get pointer to the timing data for specific node / thread
// can be used by the backends to populate data collected internally
static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
{
if (!cg->prof) { return NULL; }
return &cg->prof->timing[node_n][ith];
}

#ifndef GGML_GRAPH_PROFILER

// Stub out all profiler functions

static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
{
GGML_UNUSED(cg);
GGML_UNUSED(n_threads);
}

static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
{
GGML_UNUSED(cg);
GGML_UNUSED(n_threads);
}

static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
{
GGML_UNUSED(cg);
GGML_UNUSED(n_threads);
}

static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
{
GGML_UNUSED(cg);
}

static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
{
GGML_UNUSED(cg);
GGML_UNUSED(e);
GGML_UNUSED(node_n);
GGML_UNUSED(ith);
}

#else

void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
void ggml_graph_profile_free(struct ggml_cgraph *cg);
void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);

#endif // GGML_GRAPH_PROFILER

#ifdef __cplusplus
}
#endif
3 changes: 3 additions & 0 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

// FIXME: required here for quantization functions
#include "ggml-quants.h"
#include "ggml-profile.h"

#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
Expand Down Expand Up @@ -5884,6 +5885,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
/*.grads =*/ grads_ptr,
/*.grad_accs =*/ grad_accs_ptr,
/*.leafs =*/ leafs_ptr,
/*.prof =*/ NULL,
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
};
Expand All @@ -5910,6 +5912,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
/*.grads =*/ NULL, // gradients would need visited_hash_set
/*.grad_accs =*/ NULL,
/*.leafs =*/ NULL,
/*.prof =*/ NULL,
/*.visited_hash_set =*/ { 0, NULL, NULL },
/*.order =*/ cgraph0->order,
};
Expand Down

0 comments on commit d4051c8

Please sign in to comment.