From ceb28633800ebe55bb2b181361c962e1eac42dcf Mon Sep 17 00:00:00 2001
From: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
Date: Wed, 4 Sep 2024 03:41:23 -0700
Subject: [PATCH] [PAL/Linux-SGX] Print SGX stats on SIGUSR1 and reset them

This commit adds support to dump and reset SGX-related statistics
interactively, using SIGUSR1 signal. This helps to collect SGX-related
statistics only for a particular period, e.g. skipping the Gramine
startup and application initialization time and concentrating only on
the actual application processing.

The printed-out statistics are not precise, as the "stats collecting"
thread may run in parallel with other threads that update the
statistics. However, this imprecise implementation is simple and enough
for perf analysis.

This commit also breaks compatibility: SGX statistics can now only be
collected and printed when Gramine is built in debug or debugoptimized
mode. However, this should not affect users as SGX stats is tailored for
manual debugging and profiling sessions.

Co-authored-by: TejaswineeL <tejaswinee.ramdas.langhe@intel.com>
Signed-off-by: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
Signed-off-by: TejaswineeL <tejaswinee.ramdas.langhe@intel.com>
---
 Documentation/manifest-syntax.rst             |   4 +
 Documentation/performance.rst                 |  18 ++-
 .../fork_and_access_file.manifest.template    |   3 +
 .../multi_pthread.manifest.template           |   1 -
 .../multi_pthread_exitless.manifest.template  |   1 -
 pal/regression/Thread2.manifest.template      |   1 -
 .../Thread2_exitless.manifest.template        |   1 -
 pal/src/host/linux-sgx/host_entry.S           |   5 +
 pal/src/host/linux-sgx/host_exception.c       |  19 +++
 pal/src/host/linux-sgx/host_internal.h        |   1 -
 pal/src/host/linux-sgx/host_main.c            |   8 ++
 pal/src/host/linux-sgx/host_ocalls.c          |   7 +-
 pal/src/host/linux-sgx/host_thread.c          | 115 ++++++++++++++----
 pal/src/host/linux-sgx/pal_tcb.h              |   8 +-
 14 files changed, 152 insertions(+), 40 deletions(-)

diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst
index 772ef48ed9..c531139ff0 100644
--- a/Documentation/manifest-syntax.rst
+++ b/Documentation/manifest-syntax.rst
@@ -1293,6 +1293,10 @@ This syntax specifies whether to enable SGX enclave-specific statistics:
    includes creating the enclave, adding enclave pages, measuring them and
    initializing the enclave.
 
+For this option to take effect, Gramine must be compiled with
+``--buildtype=debug`` or ``--buildtype=debugoptimized``. Otherwise (if built in
+release mode), Gramine will exit with an error.
+
 .. warning::
    This option is insecure and cannot be used with production enclaves
    (``sgx.debug = false``). If a production enclave is started with this option
diff --git a/Documentation/performance.rst b/Documentation/performance.rst
index 44ad2de509..6cbb4206d0 100644
--- a/Documentation/performance.rst
+++ b/Documentation/performance.rst
@@ -18,11 +18,13 @@ Enabling SGX enclave stats
 
 See also :ref:`perf` below for installing ``perf``.
 
-Enable statistics using ``sgx.enable_stats = true`` manifest option. Now your
-graminized application correctly reports performance counters. This is useful
-when using e.g. ``perf stat`` to collect performance statistics. This manifest
-option also forces Gramine to dump SGX-related information on each enclave exit.
-Here is an example:
+Enable statistics using ``sgx.enable_stats = true`` manifest option (note that
+Gramine must be compiled with ``--buildtype=debug`` or
+``--buildtype=debugoptimized`` for this option to work). Now your graminized
+application correctly reports performance counters. This is useful when using
+e.g. ``perf stat`` to collect performance statistics. This manifest option also
+forces Gramine to dump SGX-related information on each enclave exit. Here is an
+example:
 
 ::
 
@@ -90,6 +92,12 @@ How to read this output:
    counters should be compared against "golden runs" to deduce any interesting
    trends.
 
+It is also possible to dump and reset SGX-related statistics interactively, using
+``SIGUSR1`` signal. This helps to collect SGX-related statistics only for a
+particular period, e.g. skipping the Gramine startup and application
+initialization time and concentrating only on the actual application processing.
+Send ``SIGUSR1`` using command ``kill -SIGUSR1 -<PGID>``.
+
 Effects of system calls / ocalls
 --------------------------------
 
diff --git a/libos/test/regression/fork_and_access_file.manifest.template b/libos/test/regression/fork_and_access_file.manifest.template
index 0f78cc750b..8f63bf4106 100644
--- a/libos/test/regression/fork_and_access_file.manifest.template
+++ b/libos/test/regression/fork_and_access_file.manifest.template
@@ -11,6 +11,9 @@ sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '16' }}
 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
 
+# this is only to test that `sgx.enable_stats` works (it can only be specified for debug-mode tests)
+sgx.enable_stats = true
+
 sgx.trusted_files = [
   "file:{{ gramine.runtimedir(libc) }}/",
   "file:{{ binary_dir }}/{{ entrypoint }}",
diff --git a/libos/test/regression/multi_pthread.manifest.template b/libos/test/regression/multi_pthread.manifest.template
index 5f43081db8..2c6240bd7a 100644
--- a/libos/test/regression/multi_pthread.manifest.template
+++ b/libos/test/regression/multi_pthread.manifest.template
@@ -12,7 +12,6 @@ sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '8' }}
 
 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
-sgx.enable_stats = true
 
 sgx.trusted_files = [
   "file:{{ gramine.runtimedir(libc) }}/",
diff --git a/libos/test/regression/multi_pthread_exitless.manifest.template b/libos/test/regression/multi_pthread_exitless.manifest.template
index 4f4ad2c00d..f2f92753db 100644
--- a/libos/test/regression/multi_pthread_exitless.manifest.template
+++ b/libos/test/regression/multi_pthread_exitless.manifest.template
@@ -13,7 +13,6 @@ sgx.insecure__rpc_thread_num = 8
 
 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
-sgx.enable_stats = true
 
 sgx.trusted_files = [
   "file:{{ gramine.runtimedir(libc) }}/",
diff --git a/pal/regression/Thread2.manifest.template b/pal/regression/Thread2.manifest.template
index fe216e7cdb..8e9926f044 100644
--- a/pal/regression/Thread2.manifest.template
+++ b/pal/regression/Thread2.manifest.template
@@ -1,6 +1,5 @@
 loader.entrypoint.uri = "file:{{ binary_dir }}/{{ entrypoint }}"
 
 sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '2' }}
-sgx.enable_stats = true
 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
diff --git a/pal/regression/Thread2_exitless.manifest.template b/pal/regression/Thread2_exitless.manifest.template
index 49557336b0..d416edd8d2 100644
--- a/pal/regression/Thread2_exitless.manifest.template
+++ b/pal/regression/Thread2_exitless.manifest.template
@@ -4,6 +4,5 @@ loader.entrypoint.uri = "file:{{ binary_dir }}/{{ entrypoint }}"
 
 sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '2' }}
 sgx.insecure__rpc_thread_num = 2
-sgx.enable_stats = true
 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
diff --git a/pal/src/host/linux-sgx/host_entry.S b/pal/src/host/linux-sgx/host_entry.S
index 97a3ebeec5..654f205ec8 100644
--- a/pal/src/host/linux-sgx/host_entry.S
+++ b/pal/src/host/linux-sgx/host_entry.S
@@ -16,6 +16,7 @@
 
     .extern tcs_base
     .extern g_in_aex_profiling
+    .extern maybe_dump_and_reset_stats
 
     .global sgx_ecall
     .type sgx_ecall, @function
@@ -102,6 +103,8 @@ async_exit_pointer:
     movq %rbx, %rdi
     call sgx_profile_sample_aex
 
+    call maybe_dump_and_reset_stats
+
     # Restore stack
     movq %rbp, %rsp
     .cfi_def_cfa_register %rsp
@@ -169,6 +172,8 @@ sgx_raise:
     movq %rdx, %rdi
     call sgx_profile_sample_ocall_inner
 
+    call maybe_dump_and_reset_stats
+
     # Restore RDI
     movq -8(%rbp), %rdi
 #else
diff --git a/pal/src/host/linux-sgx/host_exception.c b/pal/src/host/linux-sgx/host_exception.c
index 0eea389dde..63a0abf6d0 100644
--- a/pal/src/host/linux-sgx/host_exception.c
+++ b/pal/src/host/linux-sgx/host_exception.c
@@ -17,6 +17,7 @@
 #include "debug_map.h"
 #include "host_internal.h"
 #include "pal_rpc_queue.h"
+#include "pal_tcb.h"
 #include "sigreturn.h"
 #include "sigset.h"
 #include "ucontext.h"
@@ -188,6 +189,11 @@ static void handle_sigusr1(int signum, siginfo_t* info, struct ucontext* uc) {
     __UNUSED(info);
     __UNUSED(uc);
 
+    if (g_sgx_enable_stats) {
+        PAL_HOST_TCB* tcb = pal_get_host_tcb();
+        __atomic_store_n(&tcb->reset_stats, true, __ATOMIC_RELAXED);
+    }
+
     if (g_pal_enclave.profile_enable) {
         __atomic_store_n(&g_trigger_profile_reinit, true, __ATOMIC_RELEASE);
     }
@@ -268,3 +274,16 @@ void pal_describe_location(uintptr_t addr, char* buf, size_t buf_size) {
 #endif
     default_describe_location(addr, buf, buf_size);
 }
+
+#ifdef DEBUG
+/* called on each AEX and OCALL (in normal context), see host_entry.S */
+void maybe_dump_and_reset_stats(void) {
+    if (!g_sgx_enable_stats)
+        return;
+
+    PAL_HOST_TCB* tcb = pal_get_host_tcb();
+    if (__atomic_exchange_n(&tcb->reset_stats, false, __ATOMIC_RELAXED) == true) {
+        collect_and_print_sgx_stats();
+    }
+}
+#endif
diff --git a/pal/src/host/linux-sgx/host_internal.h b/pal/src/host/linux-sgx/host_internal.h
index c7699924df..bf400954bc 100644
--- a/pal/src/host/linux-sgx/host_internal.h
+++ b/pal/src/host/linux-sgx/host_internal.h
@@ -122,7 +122,6 @@ int clone_thread(void* dynamic_tcs);
 
 int create_tcs_mapper(void* tcs_base, unsigned int thread_num);
 int pal_thread_init(void* tcbptr);
-void map_tcs(unsigned int tid);
 void unmap_my_tcs(void);
 int current_enclave_thread_cnt(void);
 void thread_exit(int status);
diff --git a/pal/src/host/linux-sgx/host_main.c b/pal/src/host/linux-sgx/host_main.c
index d034e7ecdf..33b29878c4 100644
--- a/pal/src/host/linux-sgx/host_main.c
+++ b/pal/src/host/linux-sgx/host_main.c
@@ -753,6 +753,14 @@ static int parse_loader_config(char* manifest, struct pal_enclave* enclave_info,
         goto out;
     }
 
+#ifndef DEBUG
+    if (g_sgx_enable_stats) {
+        log_error("'sgx.enable_stats = true' is specified in non-debug mode, this is disallowed");
+        ret = -EINVAL;
+        goto out;
+    }
+#endif
+
     ret = toml_string_in(manifest_root, "sgx.sigfile", &dummy_sigfile_str);
     if (ret < 0 || dummy_sigfile_str) {
         log_error("sgx.sigfile is not supported anymore. Please update your manifest according to "
diff --git a/pal/src/host/linux-sgx/host_ocalls.c b/pal/src/host/linux-sgx/host_ocalls.c
index 45ac7e3f4a..31b199c0c9 100644
--- a/pal/src/host/linux-sgx/host_ocalls.c
+++ b/pal/src/host/linux-sgx/host_ocalls.c
@@ -31,9 +31,8 @@
 rpc_queue_t* g_rpc_queue = NULL; /* pointer to untrusted queue */
 
 static noreturn void process_exit(int exitcode) {
-    update_sgx_stats(/*do_print=*/true);
-
 #ifdef DEBUG
+    update_sgx_stats_on_exit(/*do_print=*/true);
     sgx_profile_finish();
 #endif
 
@@ -73,7 +72,9 @@ static long sgx_ocall_exit(void* args) {
         process_exit((int)ocall_exit_args->exitcode);
     }
 
-    update_sgx_stats(/*do_print=*/false);
+#ifdef DEBUG
+    update_sgx_stats_on_exit(/*do_print=*/false);
+#endif
     thread_exit((int)ocall_exit_args->exitcode);
     return 0;
 }
diff --git a/pal/src/host/linux-sgx/host_thread.c b/pal/src/host/linux-sgx/host_thread.c
index e5f89bc52d..70384c72d3 100644
--- a/pal/src/host/linux-sgx/host_thread.c
+++ b/pal/src/host/linux-sgx/host_thread.c
@@ -5,6 +5,7 @@
 #include <asm/prctl.h>
 #include <asm/signal.h>
 #include <linux/signal.h>
+#include <stdatomic.h>
 
 #include "asan.h"
 #include "assert.h"
@@ -16,6 +17,7 @@
 struct enclave_thread_map {
     unsigned int    tid;
     sgx_arch_tcs_t* tcs;
+    PAL_HOST_TCB*   tcb;
 };
 
 static struct enclave_thread_map* g_enclave_thread_map = NULL;
@@ -26,36 +28,92 @@ static size_t g_enclave_thread_num = 0;
 
 bool g_sgx_enable_stats = false;
 
-/* this function is called only on thread/process exit (never in the middle of thread exec) */
-void update_sgx_stats(bool do_print) {
-    static atomic_ulong g_eenter_cnt       = 0;
-    static atomic_ulong g_eexit_cnt        = 0;
-    static atomic_ulong g_aex_cnt          = 0;
-    static atomic_ulong g_sync_signal_cnt  = 0;
-    static atomic_ulong g_async_signal_cnt = 0;
+/* protected by g_enclave_thread_map_lock to prevent data races when one thread exits (and updates
+ * these global SGX counters) and another thread dumps (reads) these SGX counters */
+static uint64_t g_eenter_cnt       = 0;
+static uint64_t g_eexit_cnt        = 0;
+static uint64_t g_aex_cnt          = 0;
+static uint64_t g_sync_signal_cnt  = 0;
+static uint64_t g_async_signal_cnt = 0;
+
+static void print_global_sgx_stats(void) {
+    assert(spinlock_is_locked(&g_enclave_thread_map_lock));
+
+    int pid = g_host_pid;
+    assert(pid > 0);
+
+    log_always("----- Total SGX stats for process %d -----\n"
+               "  # of EENTERs:        %lu\n"
+               "  # of EEXITs:         %lu\n"
+               "  # of AEXs:           %lu\n"
+               "  # of sync signals:   %lu\n"
+               "  # of async signals:  %lu",
+               pid, g_eenter_cnt, g_eexit_cnt, g_aex_cnt,
+               g_sync_signal_cnt, g_async_signal_cnt);
+}
 
+static void reset_global_sgx_stats(void) {
+    assert(spinlock_is_locked(&g_enclave_thread_map_lock));
+
+    g_eenter_cnt       = 0;
+    g_eexit_cnt        = 0;
+    g_aex_cnt          = 0;
+    g_sync_signal_cnt  = 0;
+    g_async_signal_cnt = 0;
+}
+
+static void update_global_sgx_stats_from_thread_stats(PAL_HOST_TCB* tcb) {
+    assert(spinlock_is_locked(&g_enclave_thread_map_lock));
+
+    g_eenter_cnt       += atomic_exchange_explicit(&tcb->eenter_cnt, 0, memory_order_relaxed);
+    g_eexit_cnt        += atomic_exchange_explicit(&tcb->eexit_cnt, 0, memory_order_relaxed);
+    g_aex_cnt          += atomic_exchange_explicit(&tcb->aex_cnt, 0, memory_order_relaxed);
+    g_sync_signal_cnt  += atomic_exchange_explicit(&tcb->sync_signal_cnt, 0, memory_order_relaxed);
+    g_async_signal_cnt += atomic_exchange_explicit(&tcb->async_signal_cnt, 0, memory_order_relaxed);
+}
+
+/* this function is called only on thread/process exit (never in the middle of thread exec) */
+void update_sgx_stats_on_exit(bool do_print) {
     if (!g_sgx_enable_stats)
         return;
 
+    spinlock_lock(&g_enclave_thread_map_lock);
+
     PAL_HOST_TCB* tcb = pal_get_host_tcb();
-    g_eenter_cnt       += tcb->eenter_cnt;
-    g_eexit_cnt        += tcb->eexit_cnt;
-    g_aex_cnt          += tcb->aex_cnt;
-    g_sync_signal_cnt  += tcb->sync_signal_cnt;
-    g_async_signal_cnt += tcb->async_signal_cnt;
-
-    if (do_print) {
-        int pid = g_host_pid;
-        assert(pid > 0);
-        log_always("----- Total SGX stats for process %d -----\n"
-                   "  # of EENTERs:        %lu\n"
-                   "  # of EEXITs:         %lu\n"
-                   "  # of AEXs:           %lu\n"
-                   "  # of sync signals:   %lu\n"
-                   "  # of async signals:  %lu",
-                   pid, g_eenter_cnt, g_eexit_cnt, g_aex_cnt,
-                   g_sync_signal_cnt, g_async_signal_cnt);
+    update_global_sgx_stats_from_thread_stats(tcb);
+
+    if (do_print)
+        print_global_sgx_stats();
+
+    spinlock_unlock(&g_enclave_thread_map_lock);
+}
+
+void collect_and_print_sgx_stats(void) {
+    if (!g_sgx_enable_stats)
+        return;
+
+    /*
+     * This function is executed by the "SGX-stats-collecting" thread (that received SIGUSR1). Thus,
+     * this thread is able to peek into the local storage of other threads. This is typically
+     * considered a bad smell (one thread reads local data of another thread), but here it's a
+     * reasonable trade-off: most of the accesses to the thread-local SGX counters are done on EEXIT
+     * and AEX events by the thread itself, so the memory access should be as simple as possible and
+     * as fast as possible. An alternative would be to move all SGX stats in a shared array, then we
+     * would have false cache sharing and complex memory management of the shared array.
+     */
+    spinlock_lock(&g_enclave_thread_map_lock);
+    for (size_t i = 0; i < g_enclave_thread_num; i++) {
+        if (!g_enclave_thread_map[i].tcs || !g_enclave_thread_map[i].tid)
+            continue;
+
+        PAL_HOST_TCB* tcb = g_enclave_thread_map[i].tcb;
+        update_global_sgx_stats_from_thread_stats(tcb);
     }
+
+    print_global_sgx_stats();
+    reset_global_sgx_stats();
+
+    spinlock_unlock(&g_enclave_thread_map_lock);
 }
 
 void pal_host_tcb_init(PAL_HOST_TCB* tcb, void* stack, void* alt_stack) {
@@ -69,6 +127,7 @@ void pal_host_tcb_init(PAL_HOST_TCB* tcb, void* stack, void* alt_stack) {
     tcb->aex_cnt          = 0;
     tcb->sync_signal_cnt  = 0;
     tcb->async_signal_cnt = 0;
+    tcb->reset_stats      = false;
 
     tcb->profile_sample_time = 0;
 
@@ -86,6 +145,7 @@ int create_tcs_mapper(void* tcs_base, unsigned int thread_num) {
     for (uint32_t i = 0; i < thread_num; i++) {
         g_enclave_thread_map[i].tid = 0;
         g_enclave_thread_map[i].tcs = &enclave_tcs[i];
+        g_enclave_thread_map[i].tcb = NULL;
     }
     g_enclave_thread_num = thread_num;
     return 0;
@@ -147,7 +207,7 @@ static int add_dynamic_tcs(sgx_arch_tcs_t* tcs) {
     return ret;
 }
 
-void map_tcs(unsigned int tid) {
+static void map_tcs(unsigned int tid, PAL_HOST_TCB* tcb) {
     while (true) {
         spinlock_lock(&g_enclave_thread_map_lock);
         for (size_t i = 0; i < g_enclave_thread_num; i++) {
@@ -155,6 +215,8 @@ void map_tcs(unsigned int tid) {
                 continue;
             if (!g_enclave_thread_map[i].tid) {
                 g_enclave_thread_map[i].tid = tid;
+                g_enclave_thread_map[i].tcb = tcb;
+
                 pal_get_host_tcb()->tcs = g_enclave_thread_map[i].tcs;
                 ((struct enclave_dbginfo*)DBGINFO_ADDR)->thread_tids[i] = tid;
                 spinlock_unlock(&g_enclave_thread_map_lock);
@@ -186,6 +248,7 @@ void unmap_my_tcs(void) {
     for (i = 0; i < g_enclave_thread_num; i++)
         if (g_enclave_thread_map[i].tcs == pal_get_host_tcb()->tcs) {
             g_enclave_thread_map[i].tid = 0;
+            g_enclave_thread_map[i].tcb = NULL;
             ((struct enclave_dbginfo*)DBGINFO_ADDR)->thread_tids[i] = 0;
             break;
         }
@@ -237,7 +300,7 @@ int pal_thread_init(void* tcbptr) {
     }
 
     int tid = DO_SYSCALL(gettid);
-    map_tcs(tid); /* updates tcb->tcs */
+    map_tcs(tid, tcb); /* also updates tcb->tcs */
 
     if (!tcb->tcs) {
         log_error("There are no available TCS pages left for a new thread. Please try to increase"
diff --git a/pal/src/host/linux-sgx/pal_tcb.h b/pal/src/host/linux-sgx/pal_tcb.h
index 97c4534583..227506db86 100644
--- a/pal/src/host/linux-sgx/pal_tcb.h
+++ b/pal/src/host/linux-sgx/pal_tcb.h
@@ -101,6 +101,7 @@ typedef struct pal_host_tcb {
     uint64_t profile_sample_time;  /* last time sgx_profile_sample() recorded a sample */
     int32_t last_async_event;      /* last async signal, reported to the enclave on ocall return */
     int* start_status_ptr;         /* pointer to return value of clone_thread */
+    bool reset_stats;              /* if true, dump SGX stats and reset them on next AEX/OCALL */
 } PAL_HOST_TCB;
 
 extern void pal_host_tcb_init(PAL_HOST_TCB* tcb, void* stack, void* alt_stack);
@@ -115,5 +116,10 @@ static inline PAL_HOST_TCB* pal_get_host_tcb(void) {
 }
 
 extern bool g_sgx_enable_stats;
-void update_sgx_stats(bool do_print);
+void update_sgx_stats_on_exit(bool do_print);
+void collect_and_print_sgx_stats(void);
+#ifdef DEBUG
+void maybe_dump_and_reset_stats(void);
+#endif /* DEBUG */
+
 #endif /* IN_ENCLAVE */