diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 7242d6e..4bbc045 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -21,8 +21,30 @@ jobs: - name: Install ninja run: sudo apt install ninja-build - name: Configure - run: cmake -Bbuild -S. -GNinja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON + run: cmake -Bbuild -S. -G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON - name: Build run: cmake --build build - name: Test run: ctest --test-dir build + cmake-macos-x86_64: + runs-on: macos-12 + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + - name: Configure + run: cmake -Bbuild -S. -G Xcode -DCMAKE_CONFIGURATION_TYPES=Release -DHAVE_STD_REGEX=TRUE -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON -DCMAKE_OSX_ARCHITECTURES=x86_64 + - name: Build + run: cmake --build build --config Release --parallel $(sysctl -n hw.ncpu) -- -quiet + - name: Test + run: ctest --test-dir build + cmake-macos-arm64: + runs-on: macos-14 + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + - name: Configure + run: cmake -Bbuild -S. -G Xcode -DCMAKE_CONFIGURATION_TYPES=Release -DHAVE_STD_REGEX=TRUE -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON -DCMAKE_OSX_ARCHITECTURES=arm64e + - name: Build + run: cmake --build build --config Release --parallel $(sysctl -n hw.ncpu) -- -quiet + - name: Test + run: ctest --test-dir build diff --git a/bench/from-alt-array.cc b/bench/from-alt-array.cc index 17fc84e..d9d807e 100644 --- a/bench/from-alt-array.cc +++ b/bench/from-alt-array.cc @@ -63,49 +63,4 @@ static void fp16_alt_to_fp32_value(benchmark::State& state) { } BENCHMARK(fp16_alt_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); -#if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__) - static void hardware_vcvt_f32_f16(benchmark::State& state) { - const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); - auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); - - std::vector fp16(state.range(0)); - std::vector fp32(state.range(0)); - std::generate(fp16.begin(), fp16.end(), - [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); - - while (state.KeepRunning()) { - uint16_t* input = fp16.data(); - benchmark::DoNotOptimize(input); - - float* output = fp32.data(); - const size_t n = state.range(0); - #if defined(__aarch64__) - const unsigned int fpcr = __builtin_aarch64_get_fpcr(); - /* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */ - __builtin_aarch64_set_fpcr((fpcr & 0xFEFFFFFFu) | 0x08000000u); - #else - unsigned int fpscr; - __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr)); - /* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */ - __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" : - : [fpscr] "r" ((fpscr & 0xFEFFFFFFu) | 0x08000000u)); - #endif - for (size_t i = 0; i < n; i += 4) { - vst1q_f32(&output[i], - vcvt_f32_f16( - (float16x4_t) vld1_u16(&input[i]))); - } - #if defined(__aarch64__) - __builtin_aarch64_set_fpcr(fpcr); - #else - __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr)); - #endif - - benchmark::DoNotOptimize(output); - } - state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); - } - BENCHMARK(hardware_vcvt_f32_f16)->RangeMultiplier(2)->Range(1<<10, 64<<20); -#endif - BENCHMARK_MAIN(); diff --git a/bench/from-ieee-array.cc b/bench/from-ieee-array.cc index 67a11b1..d14f471 100644 --- a/bench/from-ieee-array.cc +++ b/bench/from-ieee-array.cc @@ -148,9 +148,11 @@ BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); float* output = fp32.data(); const size_t n = state.range(0); #if defined(__aarch64__) - const unsigned int fpcr = __builtin_aarch64_get_fpcr(); + unsigned int fpcr; + __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (fpcr)); /* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */ - __builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu); + __asm__ __volatile__("MSR fpcr, %[fpcr]" : + : [fpcr] "r" (fpcr & 0xF6FFFFFFu)); #else unsigned int fpscr; __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr)); @@ -164,7 +166,7 @@ BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); (float16x4_t) vld1_u16(&input[i]))); } #if defined(__aarch64__) - __builtin_aarch64_set_fpcr(fpcr); + __asm__ __volatile__("MSR fpcr, %[fpcr]" :: [fpcr] "r" (fpcr)); #else __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr)); #endif diff --git a/bench/to-alt-array.cc b/bench/to-alt-array.cc index 9f56db8..f4bea74 100644 --- a/bench/to-alt-array.cc +++ b/bench/to-alt-array.cc @@ -41,48 +41,4 @@ static void fp16_alt_from_fp32_value(benchmark::State& state) { } BENCHMARK(fp16_alt_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); -#if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__) - static void hardware_vcvt_f16_f32(benchmark::State& state) { - const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); - auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); - - std::vector fp32(state.range(0)); - std::vector fp16(state.range(0)); - std::generate(fp32.begin(), fp32.end(), std::ref(rng)); - - while (state.KeepRunning()) { - float* input = fp32.data(); - benchmark::DoNotOptimize(input); - - uint16_t* output = fp16.data(); - const size_t n = state.range(0); - #if defined(__aarch64__) - const unsigned int fpcr = __builtin_aarch64_get_fpcr(); - /* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */ - __builtin_aarch64_set_fpcr((fpcr & 0xFEFFFFFFu) | 0x08000000u); - #else - unsigned int fpscr; - __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr)); - /* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */ - __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" : - : [fpscr] "r" ((fpscr & 0xFEFFFFFFu) | 0x08000000u)); - #endif - for (size_t i = 0; i < n; i += 4) { - vst1_u16(&output[i], - (uint16x4_t) vcvt_f16_f32( - vld1q_f32(&input[i]))); - } - #if defined(__aarch64__) - __builtin_aarch64_set_fpcr(fpcr); - #else - __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr)); - #endif - - benchmark::DoNotOptimize(output); - } - state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); - } - BENCHMARK(hardware_vcvt_f16_f32)->RangeMultiplier(2)->Range(1<<10, 64<<20); -#endif - BENCHMARK_MAIN(); diff --git a/bench/to-ieee-array.cc b/bench/to-ieee-array.cc index ffdb77f..a3e3120 100644 --- a/bench/to-ieee-array.cc +++ b/bench/to-ieee-array.cc @@ -119,9 +119,11 @@ BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); uint16_t* output = fp16.data(); const size_t n = state.range(0); #if defined(__aarch64__) - const unsigned int fpcr = __builtin_aarch64_get_fpcr(); + unsigned int fpcr; + __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (fpcr)); /* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */ - __builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu); + __asm__ __volatile__("MSR fpcr, %[fpcr]" : + : [fpcr] "r" (fpcr & 0xF6FFFFFFu)); #else unsigned int fpscr; __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr)); @@ -135,7 +137,7 @@ BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); vld1q_f32(&input[i]))); } #if defined(__aarch64__) - __builtin_aarch64_set_fpcr(fpcr); + __asm__ __volatile__("MSR fpcr, %[fpcr]" :: [fpcr] "r" (fpcr)); #else __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr)); #endif diff --git a/cmake/DownloadGoogleBenchmark.cmake b/cmake/DownloadGoogleBenchmark.cmake index 0b082ba..46e5ea0 100644 --- a/cmake/DownloadGoogleBenchmark.cmake +++ b/cmake/DownloadGoogleBenchmark.cmake @@ -4,8 +4,8 @@ PROJECT(googlebenchmark-download NONE) INCLUDE(ExternalProject) ExternalProject_Add(googlebenchmark - URL https://github.com/google/benchmark/archive/v1.2.0.zip - URL_HASH SHA256=cc463b28cb3701a35c0855fbcefb75b29068443f1952b64dd5f4f669272e95ea + URL https://github.com/google/benchmark/archive/refs/tags/v1.8.4.zip + URL_HASH SHA256=84c49c4c07074f36fbf8b4f182ed7d75191a6fa72756ab4a17848455499f4286 SOURCE_DIR "${CMAKE_BINARY_DIR}/googlebenchmark-source" BINARY_DIR "${CMAKE_BINARY_DIR}/googlebenchmark" CONFIGURE_COMMAND ""