diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index cd15bf8..2a2b1e5 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -6,6 +6,7 @@ project(tinykvm) option(KVM_EXPERIMENTAL "Enable experimental features" OFF) + if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set (TINYKVM_ARCH "AMD64" CACHE STRING "TinyKVM Arch Backend") else() @@ -52,7 +53,7 @@ target_link_libraries(tinykvm PUBLIC pthread rt) set_source_files_properties( tinykvm/page_streaming.cpp - PROPERTIES COMPILE_FLAGS -mavx2) + PROPERTIES COMPILE_FLAGS -march=native) if (CMAKE_BUILD_TYPE STREQUAL "Debug") target_compile_options(tinykvm PUBLIC -O0 -ggdb3) diff --git a/lib/tinykvm/page_streaming.cpp b/lib/tinykvm/page_streaming.cpp index c1b8d8a..64571e6 100644 --- a/lib/tinykvm/page_streaming.cpp +++ b/lib/tinykvm/page_streaming.cpp @@ -4,59 +4,23 @@ namespace tinykvm { -#ifdef ENABLE_AVX2_PAGE_UTILS -void page_duplicate(uint64_t* dest, const uint64_t* source) +void avx2_page_duplicate(uint64_t* dest, const uint64_t* source) { -#if defined(__AVX2__) +#if defined(__AVX512F__) for (size_t i = 0; i < 16; i++) { - auto i0 = _mm256_stream_load_si256((__m256i *)&source[4 * 0]); - auto i1 = _mm256_stream_load_si256((__m256i *)&source[4 * 1]); - auto i2 = _mm256_stream_load_si256((__m256i *)&source[4 * 2]); - auto i3 = _mm256_stream_load_si256((__m256i *)&source[4 * 3]); - auto i4 = _mm256_stream_load_si256((__m256i *)&source[4 * 4]); - auto i5 = _mm256_stream_load_si256((__m256i *)&source[4 * 5]); - auto i6 = _mm256_stream_load_si256((__m256i *)&source[4 * 6]); - auto i7 = _mm256_stream_load_si256((__m256i *)&source[4 * 7]); + auto i0 = _mm512_load_si512((__m512i *)&source[8 * 0]); + auto i1 = _mm512_load_si512((__m512i *)&source[8 * 1]); + auto i2 = _mm512_load_si512((__m512i *)&source[8 * 2]); + auto i3 = _mm512_load_si512((__m512i *)&source[8 * 3]); - _mm256_stream_pd((double *)&dest[4 * 0], *(__m256d *) &i0); - _mm256_stream_pd((double *)&dest[4 * 1], *(__m256d *) &i1); - _mm256_stream_pd((double *)&dest[4 * 2], *(__m256d *) &i2); - _mm256_stream_pd((double *)&dest[4 * 3], *(__m256d *) &i3); - _mm256_stream_pd((double *)&dest[4 * 4], *(__m256d *) &i4); - _mm256_stream_pd((double *)&dest[4 * 5], *(__m256d *) &i5); - _mm256_stream_pd((double *)&dest[4 * 6], *(__m256d *) &i6); - _mm256_stream_pd((double *)&dest[4 * 7], *(__m256d *) &i7); - dest += 4 * 8; - source += 4 * 8; + _mm512_store_si512((__m512i *)&dest[8 * 0], i0); + _mm512_store_si512((__m512i *)&dest[8 * 1], i1); + _mm512_store_si512((__m512i *)&dest[8 * 2], i2); + _mm512_store_si512((__m512i *)&dest[8 * 3], i3); + dest += 8 * 4; + source += 8 * 4; } -#else - std::memcpy(dest, source, 4096); -#endif -} - -void page_memzero(uint64_t* dest) -{ -#if defined(__AVX2__) - auto iz = _mm256_setzero_si256(); - for (size_t i = 0; i < 16; i++) { - _mm256_stream_pd((double *)&dest[4 * 0], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 1], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 2], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 3], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 4], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 5], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 6], *(__m256d *) &iz); - _mm256_stream_pd((double *)&dest[4 * 7], *(__m256d *) &iz); - dest += 4 * 8; - } -#else - std::memset(dest, 0, 4096); -#endif -} -#endif - -void avx2_page_duplicate(uint64_t* dest, const uint64_t* source) -{ +#elif defined(__avx2__) for (size_t i = 0; i < 16; i++) { auto i0 = _mm256_load_si256((__m256i *)&source[4 * 0]); auto i1 = _mm256_load_si256((__m256i *)&source[4 * 1]); @@ -78,20 +42,37 @@ void avx2_page_duplicate(uint64_t* dest, const uint64_t* source) dest += 4 * 8; source += 4 * 8; } +#else + std::memcpy(dest, source, 4096); +#endif } -void avx2_page_dupliteit(uint64_t* dest, const uint64_t* source) +void avx2_page_memzero(uint64_t* dest) { +#if defined(__AVX512F__) + auto iz = _mm512_setzero_si512(); for (size_t i = 0; i < 16; i++) { - #pragma unroll(8) - for (int j = 0; j < 8; j++) { - __m256i zmm = _mm256_load_si256((__m256i *)&source[4 * j]); - int is_zero = _mm256_testz_si256(zmm, zmm); - if (is_zero == 0) - _mm256_store_si256((__m256i *)&dest[4 * j], zmm); - } + _mm512_store_si512((__m512i *)&dest[8 * 0], iz); + _mm512_store_si512((__m512i *)&dest[8 * 1], iz); + _mm512_store_si512((__m512i *)&dest[8 * 2], iz); + _mm512_store_si512((__m512i *)&dest[8 * 3], iz); + dest += 8 * 4; + } +#elif defined(__avx2__) + auto iz = _mm256_setzero_si256(); + for (size_t i = 0; i < 16; i++) { + _mm256_store_si256((__m256i *)&dest[4 * 0], iz); + _mm256_store_si256((__m256i *)&dest[4 * 1], iz); + _mm256_store_si256((__m256i *)&dest[4 * 2], iz); + _mm256_store_si256((__m256i *)&dest[4 * 3], iz); + _mm256_store_si256((__m256i *)&dest[4 * 4], iz); + _mm256_store_si256((__m256i *)&dest[4 * 5], iz); + _mm256_store_si256((__m256i *)&dest[4 * 6], iz); + _mm256_store_si256((__m256i *)&dest[4 * 7], iz); dest += 4 * 8; - source += 4 * 8; } +#else + std::memset(dest, 0, 4096); +#endif } } // tinykvm diff --git a/lib/tinykvm/page_streaming.hpp b/lib/tinykvm/page_streaming.hpp index 9f46a06..1517f3f 100644 --- a/lib/tinykvm/page_streaming.hpp +++ b/lib/tinykvm/page_streaming.hpp @@ -1,15 +1,10 @@ #include #include -//#define ENABLE_AVX2_PAGE_UTILS namespace tinykvm { extern void avx2_page_duplicate(uint64_t* dest, const uint64_t* source); - extern void avx2_page_dupliteit(uint64_t* dest, const uint64_t* source); + extern void avx2_page_memzero(uint64_t* dest); -#ifdef ENABLE_AVX2_PAGE_UTILS - extern void page_duplicate(uint64_t* dest, const uint64_t* source); - extern void page_memzero(uint64_t* dest); -#else inline void page_duplicate(uint64_t* dest, const uint64_t* source) { //std::memcpy(dest, source, 4096); @@ -18,8 +13,8 @@ namespace tinykvm { inline void page_memzero(uint64_t* dest) { - std::memset(dest, 0, 4096); + //std::memset(dest, 0, 4096); + avx2_page_memzero(dest); } -#endif }