Skip to content

Commit

Permalink
feat: optimize instruction fetch and decoding with big jump tables
Browse files Browse the repository at this point in the history
  • Loading branch information
edubart committed Dec 9, 2024
1 parent 7cc2524 commit 03aceb3
Show file tree
Hide file tree
Showing 15 changed files with 67,712 additions and 1,286 deletions.
35 changes: 22 additions & 13 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -214,18 +214,27 @@ SHA3_CFLAGS=-O3

# Optimization flags for the interpreter
ifneq (,$(filter yes,$(relwithdebinfo) $(release)))
ifneq (,$(filter gcc,$(CC)))
# The following flag helps GCC to eliminate more redundant computations in the interpret loop,
# saving some host instructions and improving performance.
# This flag is usually enabled by default at -O3,
# but we don't use -O3 because it enables some other flags that are not worthwhile for the interpreter.
INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ftree-partial-pre
ifneq (,$(findstring gcc,$(CC)))
# The following improves computed goto dispatch, as stated in the GCC manual
INTERPRET_CXXFLAGS+=-fno-gcse
# The following removes extra jumps in the computed goto dispatch
INTERPRET_CXXFLAGS+=-fno-crossjumping
# The following removes extra NOPs before jumping back to the interpret hot loop
INTERPRET_CXXFLAGS+=-fno-align-loops
# The interpreter dispatch loop performs better as a big inlined function
INTERPRET_CXXFLAGS+=-finline-limit=1024
# The interpreter hot loop is big and puts pressure on register allocation; this improves register use
INTERPRET_CXXFLAGS+=-frename-registers -fweb
# The interpreter instruction dispatch is big; the following reduces its size, minimizing CPU cache pressure
INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple
# Some distributions enable the stack protector by default; make sure it's disabled
INTERPRET_CXXFLAGS+=-fno-stack-protector
endif
# Disable jump tables, because they degrade the instruction decoding performance in the interpret loop,
# since they generate a memory indirection that has a high cost in opcode switches.
INTERPRET_CXXFLAGS+=-fno-jump-tables
endif

# Make testing new optimization options easier
INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS)

# Link time optimizations
ifeq ($(lto),yes)
OPTFLAGS+=-flto=auto
Expand Down Expand Up @@ -262,7 +271,7 @@ PGO_WORKLOAD=\
whetstone 25000

LINTER_IGNORE_SOURCES=
LINTER_IGNORE_HEADERS=
LINTER_IGNORE_HEADERS=interpret-jump-table.h
LINTER_SOURCES=$(filter-out $(LINTER_IGNORE_SOURCES),$(strip $(wildcard *.cpp) $(wildcard *.c)))
LINTER_HEADERS=$(filter-out $(LINTER_IGNORE_HEADERS),$(strip $(wildcard *.hpp) $(wildcard *.h)))

Expand All @@ -273,7 +282,7 @@ CLANG_FORMAT=clang-format
CLANG_FORMAT_UARCH_FILES:=$(wildcard ../uarch/*.cpp)
CLANG_FORMAT_UARCH_FILES:=$(filter-out %uarch-printf%,$(strip $(CLANG_FORMAT_UARCH_FILES)))
CLANG_FORMAT_FILES:=$(wildcard *.cpp) $(wildcard *.c) $(wildcard *.h) $(wildcard *.hpp) $(CLANG_FORMAT_UARCH_FILES)
CLANG_FORMAT_IGNORE_FILES:=
CLANG_FORMAT_IGNORE_FILES:=interpret-jump-table.h
CLANG_FORMAT_FILES:=$(strip $(CLANG_FORMAT_FILES))
CLANG_FORMAT_FILES:=$(filter-out $(CLANG_FORMAT_IGNORE_FILES),$(strip $(CLANG_FORMAT_FILES)))

Expand Down Expand Up @@ -541,12 +550,12 @@ jsonrpc-discover.cpp: jsonrpc-discover.json
echo '} // namespace cartesi' >> jsonrpc-discover.cpp

%.clang-tidy: %.cpp machine-c-version.h
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) 2>/dev/null
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) -DCLANG_TIDY_LINT 2>/dev/null
@$(CXX) $(CXXFLAGS) $(LUA_INC) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1
@touch $@

%.clang-tidy: %.c
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) 2>/dev/null
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) -DCLANG_TIDY_LINT 2>/dev/null
@$(CC) $(CFLAGS) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1
@touch $@

Expand Down
4 changes: 2 additions & 2 deletions src/device-state-access.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace cartesi {
template <typename STATE_ACCESS>
class device_state_access : public i_device_state_access {
public:
explicit device_state_access(STATE_ACCESS &a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) {
explicit device_state_access(STATE_ACCESS a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) {
static_assert(is_an_i_state_access<STATE_ACCESS>::value, "not an i_state_access");
}

Expand All @@ -52,7 +52,7 @@ class device_state_access : public i_device_state_access {
~device_state_access() override = default;

private:
STATE_ACCESS &m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members)
STATE_ACCESS m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members)
uint64_t m_mcycle;

void do_set_mip(uint64_t mask) override {
Expand Down
5 changes: 3 additions & 2 deletions src/i-state-access.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <type_traits>
#include <utility>

#include "compiler-defines.h"
#include "meta.h"
#include "shadow-tlb.h"

Expand Down Expand Up @@ -773,15 +774,15 @@ class i_state_access { // CRTP
}

/// \brief Invalidates all TLB entries of all types.
void flush_all_tlb() {
NO_INLINE void flush_all_tlb() {
derived().template flush_tlb_type<TLB_CODE>();
derived().template flush_tlb_type<TLB_READ>();
derived().template flush_tlb_type<TLB_WRITE>();
}

/// \brief Invalidates TLB entries for a specific virtual address.
/// \param vaddr Target virtual address.
void flush_tlb_vaddr(uint64_t vaddr) {
NO_INLINE void flush_tlb_vaddr(uint64_t vaddr) {
return derived().do_flush_tlb_vaddr(vaddr);
}

Expand Down
Loading

0 comments on commit 03aceb3

Please sign in to comment.