Skip to content

Commit

Permalink
Add GitHub Actions CI for Mac
Browse files Browse the repository at this point in the history
  • Loading branch information
Maratyszcza committed May 27, 2024
1 parent 680f06d commit 8d5f685
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 98 deletions.
24 changes: 23 additions & 1 deletion .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,30 @@ jobs:
- name: Install ninja
run: sudo apt install ninja-build
- name: Configure
run: cmake -Bbuild -S. -GNinja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
run: cmake -Bbuild -S. -G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
- name: Build
run: cmake --build build
- name: Test
run: ctest --test-dir build
# CI job: configure, build and test with the Xcode generator for x86_64 on a macos-12 runner.
cmake-macos-x86_64:
runs-on: macos-12
# Abort the job if it runs for more than 15 minutes.
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
- name: Configure
# Release is the only generated configuration; comparative benchmarks are enabled.
# NOTE(review): HAVE_STD_REGEX=TRUE presumably pre-answers Google Benchmark's regex feature probe - confirm.
run: cmake -Bbuild -S. -G Xcode -DCMAKE_CONFIGURATION_TYPES=Release -DHAVE_STD_REGEX=TRUE -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON -DCMAKE_OSX_ARCHITECTURES=x86_64
- name: Build
# Parallelism is the CPU count reported by sysctl; arguments after "--" are passed to the native build tool.
run: cmake --build build --config Release --parallel $(sysctl -n hw.ncpu) -- -quiet
- name: Test
run: ctest --test-dir build
# CI job: configure, build and test with the Xcode generator for arm64 on a macos-14 runner.
cmake-macos-arm64:
runs-on: macos-14
# Abort the job if it runs for more than 15 minutes.
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
- name: Configure
# Target plain arm64, not arm64e: arm64e is the pointer-authentication preview ABI, and
# third-party arm64e binaries are not runnable on a stock macOS install, which would
# break the Test step below.
run: cmake -Bbuild -S. -G Xcode -DCMAKE_CONFIGURATION_TYPES=Release -DHAVE_STD_REGEX=TRUE -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON -DCMAKE_OSX_ARCHITECTURES=arm64
- name: Build
# Parallelism is the CPU count reported by sysctl; arguments after "--" are passed to the native build tool.
run: cmake --build build --config Release --parallel $(sysctl -n hw.ncpu) -- -quiet
- name: Test
run: ctest --test-dir build
45 changes: 0 additions & 45 deletions bench/from-alt-array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,49 +63,4 @@ static void fp16_alt_to_fp32_value(benchmark::State& state) {
}
BENCHMARK(fp16_alt_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);

#if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
/*
 * Benchmarks the hardware NEON half->single conversion (vcvt_f32_f16) with the
 * ARM Alternative half-precision format enabled.
 *
 * state.range(0) is the number of elements converted per iteration; the loop
 * below consumes 4 elements per step.
 * The floating-point control register is saved before the conversion loop and
 * restored after it on every benchmark iteration.
 */
static void hardware_vcvt_f32_f16(benchmark::State& state) {
	/* Control-register bits shared by FPCR (AArch64) and FPSCR (AArch32) */
	const unsigned int flush_to_zero_bit = 1u << 24;
	const unsigned int alt_half_precision_bit = 1u << 26;

	const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
	auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));

	std::vector<uint16_t> fp16(state.range(0));
	std::vector<float> fp32(state.range(0));
	std::generate(fp16.begin(), fp16.end(),
		[&rng]{ return fp16_ieee_from_fp32_value(rng()); });

	while (state.KeepRunning()) {
		uint16_t* input = fp16.data();
		benchmark::DoNotOptimize(input);

		float* output = fp32.data();
		const size_t n = state.range(0);
#if defined(__aarch64__)
		/* Inline assembly instead of __builtin_aarch64_{get,set}_fpcr, which are GCC-only */
		uint64_t fpcr;
		__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (fpcr));
		/* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */
		const uint64_t alt_fpcr =
			(fpcr & ~uint64_t(flush_to_zero_bit)) | uint64_t(alt_half_precision_bit);
		__asm__ __volatile__("MSR fpcr, %[fpcr]" :
			: [fpcr] "r" (alt_fpcr));
#else
		unsigned int fpscr;
		__asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
		/* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */
		__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
			: [fpscr] "r" ((fpscr & ~flush_to_zero_bit) | alt_half_precision_bit));
#endif
		for (size_t i = 0; i < n; i += 4) {
			vst1q_f32(&output[i],
				vcvt_f32_f16(
					(float16x4_t) vld1_u16(&input[i])));
		}
		/* Restore the caller's floating-point control state */
#if defined(__aarch64__)
		__asm__ __volatile__("MSR fpcr, %[fpcr]" :: [fpcr] "r" (fpcr));
#else
		__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
#endif

		benchmark::DoNotOptimize(output);
	}
	state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
}
BENCHMARK(hardware_vcvt_f32_f16)->RangeMultiplier(2)->Range(1<<10, 64<<20);
#endif

BENCHMARK_MAIN();
8 changes: 5 additions & 3 deletions bench/from-ieee-array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,11 @@ BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
float* output = fp32.data();
const size_t n = state.range(0);
#if defined(__aarch64__)
const unsigned int fpcr = __builtin_aarch64_get_fpcr();
unsigned int fpcr;
__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (fpcr));
/* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
__builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu);
__asm__ __volatile__("MSR fpcr, %[fpcr]" :
: [fpcr] "r" (fpcr & 0xF6FFFFFFu));
#else
unsigned int fpscr;
__asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
Expand All @@ -164,7 +166,7 @@ BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
(float16x4_t) vld1_u16(&input[i])));
}
#if defined(__aarch64__)
__builtin_aarch64_set_fpcr(fpcr);
__asm__ __volatile__("MSR fpcr, %[fpcr]" :: [fpcr] "r" (fpcr));
#else
__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
#endif
Expand Down
44 changes: 0 additions & 44 deletions bench/to-alt-array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,48 +41,4 @@ static void fp16_alt_from_fp32_value(benchmark::State& state) {
}
BENCHMARK(fp16_alt_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);

#if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
/*
 * Benchmarks the hardware NEON single->half conversion (vcvt_f16_f32) with the
 * ARM Alternative half-precision format enabled.
 *
 * state.range(0) is the number of elements converted per iteration; the loop
 * below consumes 4 elements per step.
 * The floating-point control register is saved before the conversion loop and
 * restored after it on every benchmark iteration.
 */
static void hardware_vcvt_f16_f32(benchmark::State& state) {
	/* Control-register bits shared by FPCR (AArch64) and FPSCR (AArch32) */
	const unsigned int flush_to_zero_bit = 1u << 24;
	const unsigned int alt_half_precision_bit = 1u << 26;

	const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
	auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));

	std::vector<float> fp32(state.range(0));
	std::vector<uint16_t> fp16(state.range(0));
	std::generate(fp32.begin(), fp32.end(), std::ref(rng));

	while (state.KeepRunning()) {
		float* input = fp32.data();
		benchmark::DoNotOptimize(input);

		uint16_t* output = fp16.data();
		const size_t n = state.range(0);
#if defined(__aarch64__)
		/* Inline assembly instead of __builtin_aarch64_{get,set}_fpcr, which are GCC-only */
		uint64_t fpcr;
		__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (fpcr));
		/* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */
		const uint64_t alt_fpcr =
			(fpcr & ~uint64_t(flush_to_zero_bit)) | uint64_t(alt_half_precision_bit);
		__asm__ __volatile__("MSR fpcr, %[fpcr]" :
			: [fpcr] "r" (alt_fpcr));
#else
		unsigned int fpscr;
		__asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
		/* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */
		__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
			: [fpscr] "r" ((fpscr & ~flush_to_zero_bit) | alt_half_precision_bit));
#endif
		for (size_t i = 0; i < n; i += 4) {
			vst1_u16(&output[i],
				(uint16x4_t) vcvt_f16_f32(
					vld1q_f32(&input[i])));
		}
		/* Restore the caller's floating-point control state */
#if defined(__aarch64__)
		__asm__ __volatile__("MSR fpcr, %[fpcr]" :: [fpcr] "r" (fpcr));
#else
		__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
#endif

		benchmark::DoNotOptimize(output);
	}
	state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
}
BENCHMARK(hardware_vcvt_f16_f32)->RangeMultiplier(2)->Range(1<<10, 64<<20);
#endif

BENCHMARK_MAIN();
8 changes: 5 additions & 3 deletions bench/to-ieee-array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,11 @@ BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
uint16_t* output = fp16.data();
const size_t n = state.range(0);
#if defined(__aarch64__)
const unsigned int fpcr = __builtin_aarch64_get_fpcr();
unsigned int fpcr;
__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (fpcr));
/* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
__builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu);
__asm__ __volatile__("MSR fpcr, %[fpcr]" :
: [fpcr] "r" (fpcr & 0xF6FFFFFFu));
#else
unsigned int fpscr;
__asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
Expand All @@ -135,7 +137,7 @@ BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
vld1q_f32(&input[i])));
}
#if defined(__aarch64__)
__builtin_aarch64_set_fpcr(fpcr);
__asm__ __volatile__("MSR fpcr, %[fpcr]" :: [fpcr] "r" (fpcr));
#else
__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
#endif
Expand Down
4 changes: 2 additions & 2 deletions cmake/DownloadGoogleBenchmark.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ PROJECT(googlebenchmark-download NONE)

INCLUDE(ExternalProject)
ExternalProject_Add(googlebenchmark
URL https://github.com/google/benchmark/archive/v1.2.0.zip
URL_HASH SHA256=cc463b28cb3701a35c0855fbcefb75b29068443f1952b64dd5f4f669272e95ea
URL https://github.com/google/benchmark/archive/refs/tags/v1.8.4.zip
URL_HASH SHA256=84c49c4c07074f36fbf8b4f182ed7d75191a6fa72756ab4a17848455499f4286
SOURCE_DIR "${CMAKE_BINARY_DIR}/googlebenchmark-source"
BINARY_DIR "${CMAKE_BINARY_DIR}/googlebenchmark"
CONFIGURE_COMMAND ""
Expand Down

0 comments on commit 8d5f685

Please sign in to comment.