From d0c601aa3041e2091a80bba205f78d578127892e Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Tue, 17 Dec 2024 19:21:58 -0300 Subject: [PATCH] feat: optimize uarch reset to speed up CI testing --- src/compiler-defines.h | 6 +++++ src/is-pristine.h | 43 +++++++++++++++++++++++++++++++++ src/machine.cpp | 6 ++--- src/pma.cpp | 40 +++++++++++++++++++++++++++--- src/uarch-record-state-access.h | 3 ++- src/uarch-state-access.h | 3 ++- uarch/Makefile | 1 + 7 files changed, 92 insertions(+), 10 deletions(-) create mode 100644 src/is-pristine.h diff --git a/src/compiler-defines.h b/src/compiler-defines.h index 0fb7eb004..b2e331db9 100644 --- a/src/compiler-defines.h +++ b/src/compiler-defines.h @@ -40,4 +40,10 @@ #define PACKED __attribute__((packed)) +#if defined(__GNUC__) +#define FORCE_OPTIMIZE_O3 __attribute__((optimize("-O3"))) +#else +#define FORCE_OPTIMIZE_O3 +#endif + #endif diff --git a/src/is-pristine.h b/src/is-pristine.h new file mode 100644 index 000000000..458d297fc --- /dev/null +++ b/src/is-pristine.h @@ -0,0 +1,43 @@ +// Copyright Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: LGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) any +// later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +// PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License along +// with this program (see COPYING). If not, see . +// + +#ifndef IS_PRISTINE_H +#define IS_PRISTINE_H + +#include "compiler-defines.h" +#include +#include + +namespace cartesi { + +/// \brief This is an optimized function for checking if memory page is pristine. +/// \param data Memory pointer +/// \param length Memory length +/// \details It's instead to be used in situations where length is equal or less than a page size. +// NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) +static inline bool FORCE_OPTIMIZE_O3 is_pristine(const unsigned char *data, size_t length) { + // This tight for loop has no branches, and is optimized to SIMD instructions in x86_64, + // making it very fast to check if a given page is pristine. + unsigned char bits = 0; + for (size_t i = 0; i < length; ++i) { + bits |= data[i]; + } + return bits == 0; +} + +} // namespace cartesi + +#endif \ No newline at end of file diff --git a/src/machine.cpp b/src/machine.cpp index 28836c885..cb34b4c52 100644 --- a/src/machine.cpp +++ b/src/machine.cpp @@ -43,6 +43,7 @@ #include "htif.h" #include "i-device-state-access.h" #include "interpret.h" +#include "is-pristine.h" #include "machine-config.h" #include "machine-memory-range-descr.h" #include "machine-runtime-config.h" @@ -2031,10 +2032,7 @@ bool machine::update_merkle_tree() const { return false; } if (page_data != nullptr) { - const bool is_pristine = std::all_of(page_data, page_data + PMA_PAGE_SIZE, - [](unsigned char pp) -> bool { return pp == '\0'; }); - - if (is_pristine) { + if (is_pristine(page_data, PMA_PAGE_SIZE)) { // The update_page_node_hash function in the machine_merkle_tree is not thread // safe, so we protect it with a mutex const parallel_for_mutex_guard lock(mutex); diff --git a/src/pma.cpp b/src/pma.cpp index cde8d8462..45d29a5e5 100644 --- a/src/pma.cpp +++ b/src/pma.cpp @@ -26,6 +26,7 @@ #include #include +#include "is-pristine.h" #include "os.h" #include "pma-constants.h" #include "pma-driver.h" @@ -162,8 +163,24 @@ void pma_entry::write_memory(uint64_t paddr, const unsigned char *data, uint64_t if (data == nullptr) { throw std::invalid_argument{"invalid data buffer"}; } - memcpy(get_memory().get_host_memory() + (paddr - get_start()), data, size); - mark_dirty_pages(paddr, size); + // The case of writing a large range chunk is special and optimized for uarch reset + if (size > PMA_PAGE_SIZE) { + // Copy in chunks of page size, to avoid marking dirty pages unnecessarily + for (uint64_t offset = 0; offset < size; offset += PMA_PAGE_SIZE) { + const uint64_t paddr_offset = paddr + offset; + const uint64_t chunk_len = std::min(PMA_PAGE_SIZE, size - offset); + const unsigned char *src = data + offset; + unsigned char *dest = get_memory().get_host_memory() + (paddr_offset - get_start()); + if (memcmp(dest, src, chunk_len) != 0) { + // Page is different, we have to copy memory + memcpy(dest, src, chunk_len); + mark_dirty_pages(paddr + offset, chunk_len); + } + } + } else { + memcpy(get_memory().get_host_memory() + (paddr - get_start()), data, size); + mark_dirty_pages(paddr, size); + } } void pma_entry::fill_memory(uint64_t paddr, unsigned char value, uint64_t size) { @@ -173,8 +190,23 @@ void pma_entry::fill_memory(uint64_t paddr, unsigned char value, uint64_t size) if (!contains(paddr, size)) { throw std::invalid_argument{"range not contained in pma"}; } - memset(get_memory().get_host_memory() + (paddr - get_start()), value, size); - mark_dirty_pages(paddr, size); + // The case of filling a large range with zeros is special and optimized for uarch reset + if (value == 0 && size > PMA_PAGE_SIZE) { + // Fill in chunks of page size, to avoid marking dirty pages unnecessarily + for (uint64_t offset = 0; offset < size; offset += PMA_PAGE_SIZE) { + const uint64_t paddr_offset = paddr + offset; + const uint64_t chunk_len = std::min(PMA_PAGE_SIZE, size - offset); + unsigned char *dest = get_memory().get_host_memory() + (paddr_offset - get_start()); + if (!is_pristine(dest, chunk_len)) { + // Page is different, we have to fill memory + memset(dest, 0, chunk_len); + mark_dirty_pages(paddr + offset, chunk_len); + } + } + } else { + memset(get_memory().get_host_memory() + (paddr - get_start()), value, size); + mark_dirty_pages(paddr, size); + } } bool pma_peek_error(const pma_entry & /*pma*/, const machine & /*m*/, uint64_t /*page_address*/, diff --git a/src/uarch-record-state-access.h b/src/uarch-record-state-access.h index 4b664c5f2..18122fd9a 100644 --- a/src/uarch-record-state-access.h +++ b/src/uarch-record-state-access.h @@ -449,8 +449,9 @@ class uarch_record_state_access : public i_uarch_state_accessget_log_type().has_large_data()) { // log written data, if debug info is enabled a.get_written().emplace(get_uarch_state_image()); diff --git a/src/uarch-state-access.h b/src/uarch-state-access.h index 02d8e999e..c0578f2e6 100644 --- a/src/uarch-state-access.h +++ b/src/uarch-state-access.h @@ -195,8 +195,9 @@ class uarch_state_access : public i_uarch_state_access { if (uarch_pristine_ram_len > m_us.ram.get_length()) { throw std::runtime_error("embedded uarch ram image does not fit in uarch ram pma"); } - m_us.ram.fill_memory(m_us.ram.get_start(), 0, m_us.ram.get_length()); m_us.ram.write_memory(m_us.ram.get_start(), uarch_pristine_ram, uarch_pristine_ram_len); + m_us.ram.fill_memory(m_us.ram.get_start() + uarch_pristine_ram_len, 0, + m_us.ram.get_length() - uarch_pristine_ram_len); } }; diff --git a/uarch/Makefile b/uarch/Makefile index cd80e391c..41a8fe619 100644 --- a/uarch/Makefile +++ b/uarch/Makefile @@ -43,6 +43,7 @@ WARNFLAGS := -Wall -Wextra -Wpedantic -Wno-array-bounds -Werror CFLAGS := -march=rv64i -mabi=lp64 -Wl,--gc-sections $(OPTFLAGS) $(UBFLAGS) $(WARNFLAGS) \ -DMICROARCHITECTURE=1 \ + -DNO_COMPUTED_GOTO \ -DAVOID_NATIVE_UINT128_T=1 \ -ffreestanding \ -nostartfiles \