Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ project(tinykvm)

option(KVM_EXPERIMENTAL "Enable experimental features" OFF)


if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set (TINYKVM_ARCH "AMD64" CACHE STRING "TinyKVM Arch Backend")
else()
Expand Down Expand Up @@ -52,7 +53,7 @@ target_link_libraries(tinykvm PUBLIC pthread rt)

set_source_files_properties(
tinykvm/page_streaming.cpp
PROPERTIES COMPILE_FLAGS -mavx2)
PROPERTIES COMPILE_FLAGS -march=native)

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_options(tinykvm PUBLIC -O0 -ggdb3)
Expand Down
97 changes: 39 additions & 58 deletions lib/tinykvm/page_streaming.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,59 +4,23 @@

namespace tinykvm {

#ifdef ENABLE_AVX2_PAGE_UTILS
void page_duplicate(uint64_t* dest, const uint64_t* source)
void avx2_page_duplicate(uint64_t* dest, const uint64_t* source)
{
#if defined(__AVX2__)
#if defined(__AVX512F__)
for (size_t i = 0; i < 16; i++) {
auto i0 = _mm256_stream_load_si256((__m256i *)&source[4 * 0]);
auto i1 = _mm256_stream_load_si256((__m256i *)&source[4 * 1]);
auto i2 = _mm256_stream_load_si256((__m256i *)&source[4 * 2]);
auto i3 = _mm256_stream_load_si256((__m256i *)&source[4 * 3]);
auto i4 = _mm256_stream_load_si256((__m256i *)&source[4 * 4]);
auto i5 = _mm256_stream_load_si256((__m256i *)&source[4 * 5]);
auto i6 = _mm256_stream_load_si256((__m256i *)&source[4 * 6]);
auto i7 = _mm256_stream_load_si256((__m256i *)&source[4 * 7]);
auto i0 = _mm512_load_si512((__m512i *)&source[8 * 0]);
auto i1 = _mm512_load_si512((__m512i *)&source[8 * 1]);
auto i2 = _mm512_load_si512((__m512i *)&source[8 * 2]);
auto i3 = _mm512_load_si512((__m512i *)&source[8 * 3]);

_mm256_stream_pd((double *)&dest[4 * 0], *(__m256d *) &i0);
_mm256_stream_pd((double *)&dest[4 * 1], *(__m256d *) &i1);
_mm256_stream_pd((double *)&dest[4 * 2], *(__m256d *) &i2);
_mm256_stream_pd((double *)&dest[4 * 3], *(__m256d *) &i3);
_mm256_stream_pd((double *)&dest[4 * 4], *(__m256d *) &i4);
_mm256_stream_pd((double *)&dest[4 * 5], *(__m256d *) &i5);
_mm256_stream_pd((double *)&dest[4 * 6], *(__m256d *) &i6);
_mm256_stream_pd((double *)&dest[4 * 7], *(__m256d *) &i7);
dest += 4 * 8;
source += 4 * 8;
_mm512_store_si512((__m512i *)&dest[8 * 0], i0);
_mm512_store_si512((__m512i *)&dest[8 * 1], i1);
_mm512_store_si512((__m512i *)&dest[8 * 2], i2);
_mm512_store_si512((__m512i *)&dest[8 * 3], i3);
dest += 8 * 4;
source += 8 * 4;
}
#else
std::memcpy(dest, source, 4096);
#endif
}

void page_memzero(uint64_t* dest)
{
#if defined(__AVX2__)
auto iz = _mm256_setzero_si256();
for (size_t i = 0; i < 16; i++) {
_mm256_stream_pd((double *)&dest[4 * 0], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 1], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 2], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 3], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 4], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 5], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 6], *(__m256d *) &iz);
_mm256_stream_pd((double *)&dest[4 * 7], *(__m256d *) &iz);
dest += 4 * 8;
}
#else
std::memset(dest, 0, 4096);
#endif
}
#endif

void avx2_page_duplicate(uint64_t* dest, const uint64_t* source)
{
#elif defined(__avx2__)
for (size_t i = 0; i < 16; i++) {
auto i0 = _mm256_load_si256((__m256i *)&source[4 * 0]);
auto i1 = _mm256_load_si256((__m256i *)&source[4 * 1]);
Expand All @@ -78,20 +42,37 @@ void avx2_page_duplicate(uint64_t* dest, const uint64_t* source)
dest += 4 * 8;
source += 4 * 8;
}
#else
std::memcpy(dest, source, 4096);
#endif
}
void avx2_page_dupliteit(uint64_t* dest, const uint64_t* source)
void avx2_page_memzero(uint64_t* dest)
{
#if defined(__AVX512F__)
auto iz = _mm512_setzero_si512();
for (size_t i = 0; i < 16; i++) {
#pragma unroll(8)
for (int j = 0; j < 8; j++) {
__m256i zmm = _mm256_load_si256((__m256i *)&source[4 * j]);
int is_zero = _mm256_testz_si256(zmm, zmm);
if (is_zero == 0)
_mm256_store_si256((__m256i *)&dest[4 * j], zmm);
}
_mm512_store_si512((__m512i *)&dest[8 * 0], iz);
_mm512_store_si512((__m512i *)&dest[8 * 1], iz);
_mm512_store_si512((__m512i *)&dest[8 * 2], iz);
_mm512_store_si512((__m512i *)&dest[8 * 3], iz);
dest += 8 * 4;
}
#elif defined(__avx2__)
auto iz = _mm256_setzero_si256();
for (size_t i = 0; i < 16; i++) {
_mm256_store_si256((__m256i *)&dest[4 * 0], iz);
_mm256_store_si256((__m256i *)&dest[4 * 1], iz);
_mm256_store_si256((__m256i *)&dest[4 * 2], iz);
_mm256_store_si256((__m256i *)&dest[4 * 3], iz);
_mm256_store_si256((__m256i *)&dest[4 * 4], iz);
_mm256_store_si256((__m256i *)&dest[4 * 5], iz);
_mm256_store_si256((__m256i *)&dest[4 * 6], iz);
_mm256_store_si256((__m256i *)&dest[4 * 7], iz);
dest += 4 * 8;
source += 4 * 8;
}
#else
std::memset(dest, 0, 4096);
#endif
}

} // tinykvm
11 changes: 3 additions & 8 deletions lib/tinykvm/page_streaming.hpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
#include <cstdint>
#include <cstring>
//#define ENABLE_AVX2_PAGE_UTILS

namespace tinykvm {
extern void avx2_page_duplicate(uint64_t* dest, const uint64_t* source);
extern void avx2_page_dupliteit(uint64_t* dest, const uint64_t* source);
extern void avx2_page_memzero(uint64_t* dest);

#ifdef ENABLE_AVX2_PAGE_UTILS
extern void page_duplicate(uint64_t* dest, const uint64_t* source);
extern void page_memzero(uint64_t* dest);
#else
inline void page_duplicate(uint64_t* dest, const uint64_t* source)
{
//std::memcpy(dest, source, 4096);
Expand All @@ -18,8 +13,8 @@ namespace tinykvm {

inline void page_memzero(uint64_t* dest)
{
std::memset(dest, 0, 4096);
//std::memset(dest, 0, 4096);
avx2_page_memzero(dest);
}
#endif

}