From 88403a9b498cd246e64e473b81e2ffcc5da1471d Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Wed, 19 Jun 2024 23:11:08 -0700
Subject: [PATCH] Support native conversions without __fp16/_Float16 types

---
 .github/workflows/cmake.yml | 49 +++++++++++++++++++++
 include/fp16/fp16.h         | 86 +++++++++++++++++++++++++------------
 include/fp16/macros.h       | 14 ++++++
 3 files changed, 121 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 8878700..7bf724d 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -65,6 +65,27 @@ jobs:
         run: cmake --build build --parallel
       - name: Test
         run: ctest --test-dir build --parallel --output-on-failure
+  cmake-linux-x86-f16c:
+    runs-on: ubuntu-20.04
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+      - name: Update apt
+        run: sudo apt update
+      - name: Install multilib gcc
+        run: sudo apt install gcc-multilib g++-multilib
+      - name: Install ninja
+        run: sudo apt install ninja-build
+      - name: Configure
+        run: cmake -Bbuild -S. -G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
+        env:
+          CFLAGS: "-m32 -mf16c"
+          CXXFLAGS: "-m32 -mf16c"
+          LDFLAGS: "-m32"
+      - name: Build
+        run: cmake --build build --parallel
+      - name: Test
+        run: ctest --test-dir build --parallel --output-on-failure
   cmake-macos-x86_64:
     runs-on: macos-12
     timeout-minutes: 15
@@ -115,6 +136,20 @@ jobs:
         run: cmake --build build --config Release --parallel
       - name: Test
         run: ctest --test-dir build --build-config Release --parallel --output-on-failure
+  cmake-windows-x86-avx2:
+    runs-on: windows-2019
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+      - name: Configure
+        run: cmake -Bbuild -S. -G "Visual Studio 16 2019" -A Win32 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
+        env:
+          CFLAGS: "/arch:AVX2"
+          CXXFLAGS: "/arch:AVX2"
+      - name: Build
+        run: cmake --build build --config Release --parallel
+      - name: Test
+        run: ctest --test-dir build --build-config Release --parallel --output-on-failure
   cmake-windows-x64:
     runs-on: windows-2019
     timeout-minutes: 15
@@ -126,6 +161,20 @@ jobs:
         run: cmake --build build --config Release --parallel
       - name: Test
         run: ctest --test-dir build --build-config Release --parallel --output-on-failure
+  cmake-windows-x64-avx2:
+    runs-on: windows-2019
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+      - name: Configure
+        run: cmake -Bbuild -S. -G "Visual Studio 16 2019" -A x64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
+        env:
+          CFLAGS: "/arch:AVX2"
+          CXXFLAGS: "/arch:AVX2"
+      - name: Build
+        run: cmake --build build --config Release --parallel
+      - name: Test
+        run: ctest --test-dir build --build-config Release --parallel --output-on-failure
   cmake-windows-arm64:
     runs-on: windows-2019
     timeout-minutes: 15
diff --git a/include/fp16/fp16.h b/include/fp16/fp16.h
index 0bcf61b..e87cf52 100644
--- a/include/fp16/fp16.h
+++ b/include/fp16/fp16.h
@@ -10,13 +10,19 @@
 	#include <math.h>
 #endif
 
-#ifdef _MSC_VER
-	#include <intrin.h>
-#endif
-
 #include <fp16/bitcasts.h>
 #include <fp16/macros.h>
 
+#if defined(_MSC_VER)
+	#include <intrin.h>
+#endif
+#if defined(__F16C__) && FP16_USE_NATIVE_CONVERSION && !FP16_USE_FLOAT16_TYPE && !FP16_USE_FP16_TYPE
+	#include <immintrin.h>
+#endif
+#if (defined(__aarch64__) || defined(_M_ARM64)) && FP16_USE_NATIVE_CONVERSION && !FP16_USE_FLOAT16_TYPE && !FP16_USE_FP16_TYPE
+	#include <arm_neon.h>
+#endif
+
 
 /*
  * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
@@ -107,18 +113,30 @@ static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
  * floating-point operations and bitcasts between integer and floating-point variables.
  */
 static inline float fp16_ieee_to_fp32_value(uint16_t h) {
-#if FP16_USE_FLOAT16_TYPE
-	union {
-		uint16_t as_bits;
-		_Float16 as_value;
-	} fp16 = { h };
-	return (float) fp16.as_value;
-#elif FP16_USE_FP16_TYPE
-	union {
-		uint16_t as_bits;
-		__fp16 as_value;
-	} fp16 = { h };
-	return (float) fp16.as_value;
+#if FP16_USE_NATIVE_CONVERSION
+	#if FP16_USE_FLOAT16_TYPE
+		union {
+			uint16_t as_bits;
+			_Float16 as_value;
+		} fp16 = { h };
+		return (float) fp16.as_value;
+	#elif FP16_USE_FP16_TYPE
+		union {
+			uint16_t as_bits;
+			__fp16 as_value;
+		} fp16 = { h };
+		return (float) fp16.as_value;
+	#else
+		#if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__)
+			return _cvtsh_ss((unsigned short) h);  /* scalar F16C intrinsic: half bits -> float */
+		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
+			return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128((int) (unsigned int) h)));  /* zero-extend h into lane 0, convert, extract lane 0 */
+		#elif defined(_M_ARM64) || defined(__aarch64__)
+			return vget_lane_f32(vcvt_f32_f16(vreinterpret_f16_u16(vdup_n_u16(h))), 0);  /* broadcast bits, view as f16, widen, take lane 0 */
+		#else
+			#error "Architecture- or compiler-specific implementation required"
+		#endif
+	#endif
 #else
 	/*
 	 * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
@@ -236,18 +254,30 @@ static inline float fp16_ieee_to_fp32_value(uint16_t h) {
  * floating-point operations and bitcasts between integer and floating-point variables.
  */
 static inline uint16_t fp16_ieee_from_fp32_value(float f) {
-#if FP16_USE_FLOAT16_TYPE
-	union {
-		_Float16 as_value;
-		uint16_t as_bits;
-	} fp16 = { (_Float16) f };
-	return fp16.as_bits;
-#elif FP16_USE_FP16_TYPE
-	union {
-		__fp16 as_value;
-		uint16_t as_bits;
-	} fp16 = { (__fp16) f };
-	return fp16.as_bits;
+#if FP16_USE_NATIVE_CONVERSION
+	#if FP16_USE_FLOAT16_TYPE
+		union {
+			_Float16 as_value;
+			uint16_t as_bits;
+		} fp16 = { (_Float16) f };
+		return fp16.as_bits;
+	#elif FP16_USE_FP16_TYPE
+		union {
+			__fp16 as_value;
+			uint16_t as_bits;
+		} fp16 = { (__fp16) f };
+		return fp16.as_bits;
+	#else
+		#if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__)
+			return _cvtss_sh(f, _MM_FROUND_CUR_DIRECTION);  /* scalar F16C intrinsic: float -> half bits, current rounding mode */
+		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
+			return (uint16_t) _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_CUR_DIRECTION));  /* convert lane 0, keep its low 16 bits */
+		#elif defined(_M_ARM64) || defined(__aarch64__)
+			return vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f))), 0);  /* vcvt yields float16x4_t; view as u16 lanes before extracting bits */
+		#else
+			#error "Architecture- or compiler-specific implementation required"
+		#endif
+	#endif
 #else
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
 	const float scale_to_inf = 0x1.0p+112f;
diff --git a/include/fp16/macros.h b/include/fp16/macros.h
index 2503f63..4018b0c 100644
--- a/include/fp16/macros.h
+++ b/include/fp16/macros.h
@@ -2,6 +2,20 @@
 #ifndef FP16_MACROS_H
 #define FP16_MACROS_H
 
+#ifndef FP16_USE_NATIVE_CONVERSION  /* a definition supplied before this header takes precedence over auto-detection */
+	#if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__)
+		#define FP16_USE_NATIVE_CONVERSION 1  /* GCC-compatible or Intel compiler with __F16C__ defined */
+	#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
+		#define FP16_USE_NATIVE_CONVERSION 1  /* MSVC targeting x86/x64 with __AVX2__ defined */
+	#elif defined(_MSC_VER) && defined(_M_ARM64)
+		#define FP16_USE_NATIVE_CONVERSION 1  /* MSVC targeting ARM64 */
+	#elif defined(__GNUC__) && defined(__aarch64__)
+		#define FP16_USE_NATIVE_CONVERSION 1  /* GCC-compatible compiler targeting AArch64 */
+	#endif
+	#if !defined(FP16_USE_NATIVE_CONVERSION)
+		#define FP16_USE_NATIVE_CONVERSION 0  /* none of the cases above matched: native conversion disabled */
+	#endif  // !defined(FP16_USE_NATIVE_CONVERSION)
+#endif  // !defined(FP16_USE_NATIVE_CONVERSION)
 
 #ifndef FP16_USE_FLOAT16_TYPE
 	#if !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(__GNUC__) && (__GNUC__ >= 12)