Fix bug in approx::exp(bfloat16) for HIP

KernelTuner · Dec 2, 2024 · commit 846de1f · 1 parent a2b08a5


Showing 2 changed files with 14 additions and 10 deletions.
diff --git a/include/kernel_float/approx.h b/include/kernel_float/approx.h
@@ -10,7 +10,9 @@ namespace kernel_float {
 namespace approx {
 
 static_assert(sizeof(unsigned int) * 8 == 32, "invalid size of unsigned int");
+static_assert(sizeof(unsigned short) * 8 == 16, "invalid size of unsigned short");
 using uint32_t = unsigned int;
+using uint16_t = unsigned short;
 
 template<typename T, typename U>
 KERNEL_FLOAT_DEVICE T transmute(const U& input) {
@@ -353,12 +355,12 @@ KERNEL_FLOAT_DEVICE bfloat16x2_t exp(bfloat16x2_t arg) {
     static constexpr float OFFSET = 382.4958400542335;
     static constexpr float MINIMUM = 382;
 
-    float a = fmaxf(fmaf(bfloat162float(arg.x), SCALE, OFFSET), MINIMUM);
-    float b = fmaxf(fmaf(bfloat162float(arg.y), SCALE, OFFSET), MINIMUM);
+    float a = fmaxf(fmaf(__bfloat162float(arg.x), SCALE, OFFSET), MINIMUM);
+    float b = fmaxf(fmaf(__bfloat162float(arg.y), SCALE, OFFSET), MINIMUM);
 
     return {
-        transmute<__bfloat16>(uint16_t(transmute<uint32_t>(a))),
-        transmute<__bfloat16>(uint16_t(transmute<uint32_t>(b)))};
+        transmute<bfloat16_t>(uint16_t(transmute<uint32_t>(a))),
+        transmute<bfloat16_t>(uint16_t(transmute<uint32_t>(b)))};
 }
 #endif
 }  // namespace approx

diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2024-11-26 14:20:49.081641
-// git hash: 76c695a4cc5b13b3d5841ac5085574a5b47a299c
+// date: 2024-12-02 10:59:19.296684
+// git hash: a2b08a56e31d1c9a6302c8a49c740cf56fcc1607
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -4535,7 +4535,9 @@ namespace kernel_float {
 namespace approx {
 
 static_assert(sizeof(unsigned int) * 8 == 32, "invalid size of unsigned int");
+static_assert(sizeof(unsigned short) * 8 == 16, "invalid size of unsigned short");
 using uint32_t = unsigned int;
+using uint16_t = unsigned short;
 
 template<typename T, typename U>
 KERNEL_FLOAT_DEVICE T transmute(const U& input) {
@@ -4878,12 +4880,12 @@ KERNEL_FLOAT_DEVICE bfloat16x2_t exp(bfloat16x2_t arg) {
     static constexpr float OFFSET = 382.4958400542335;
     static constexpr float MINIMUM = 382;
 
-    float a = fmaxf(fmaf(bfloat162float(arg.x), SCALE, OFFSET), MINIMUM);
-    float b = fmaxf(fmaf(bfloat162float(arg.y), SCALE, OFFSET), MINIMUM);
+    float a = fmaxf(fmaf(__bfloat162float(arg.x), SCALE, OFFSET), MINIMUM);
+    float b = fmaxf(fmaf(__bfloat162float(arg.y), SCALE, OFFSET), MINIMUM);
 
     return {
-        transmute<__bfloat16>(uint16_t(transmute<uint32_t>(a))),
-        transmute<__bfloat16>(uint16_t(transmute<uint32_t>(b)))};
+        transmute<bfloat16_t>(uint16_t(transmute<uint32_t>(a))),
+        transmute<bfloat16_t>(uint16_t(transmute<uint32_t>(b)))};
 }
 #endif
 }  // namespace approx