Merge pull request #170 from munroesj52/stable/1.0.4-4

Stable/1.0.4 4
open-power-sdk · Sep 16, 2022 · e3ecd3b · e3ecd3b
2 parents 6075912 + c661e85
commit e3ecd3b
Show file tree

Hide file tree

Showing 9 changed files with 122 additions and 58 deletions.
diff --git a/configure b/configure
@@ -13430,7 +13430,7 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
-
+# remove AC_PROG_LIBTOOL for autotools 2.71
 
 # This directive is to avoid buggy libtool that doesn't add the '-Wl,--no-as-needed'
 # directive in the correct position of LDFLAGS

diff --git a/configure.ac b/configure.ac
@@ -20,7 +20,7 @@ PVECLIB_SO_VERSION=1:4:0
 AC_SUBST(PVECLIB_SO_VERSION)
 
 AC_PROG_CC
-LT_INIT
+# remove AC_PROG_LIBTOOL for autotools 2.71
 
 # This directive is to avoid buggy libtool that doesn't add the '-Wl,--no-as-needed'
 # directive in the correct position of LDFLAGS

diff --git a/src/pveclib/vec_f128_ppc.h b/src/pveclib/vec_f128_ppc.h
@@ -195,12 +195,31 @@ test_cosf128 (__binary128 value)
    -mcu=power9 and -mfloat128.
    So far clang does not support/define the __ibm128 type. */
 #ifdef __FLOAT128__
-typedef __float128 __Float128;
+#ifndef __clang__
+// For now assume that "not __clang__" implies GCC.
+// Can't just use #ifdef __GNUC__ because Clang defines it too.
+#ifdef __float128
+// Can assume GCC 7 or later so ...
+// That version defines __ieee128 internally and
+// #defines __float128 to __ieee128, so both are defined
+// Define __binary128 so both GCC and Clang can use a single type
+#define __binary128 __ieee128
+#else
+// Assume GCC 6 or earlier
+// So the compiler defines __float128 only
 typedef __float128 __binary128;
 typedef __float128 __ieee128;
-#ifndef __clang__
+#endif
+#if (__GNUC__ < 7)
+typedef __float128 _Float128;
+#endif
 typedef __ibm128 __IBM128;
 #else
+/* Clang started defining __FLOAT128__ and does not allow redefining
+   __float128 or __ieee128. Worse it will give errors if you try to
+   use either type. So define __binary128 as if __FLOAT128__ is not
+   defined. */
+typedef vui128_t __binary128;
 /* Clang does not define __ibm128 over IBM long double.
    So defined it here. */
 typedef long double __IBM128;

diff --git a/src/pveclib/vec_f32_ppc.h b/src/pveclib/vec_f32_ppc.h
@@ -774,31 +774,52 @@ vec_any_iszerof32 (vf32_t vf32)
 #endif
 }
 
-/** \brief Copy the sign bit from vf32y merged with magnitude from
- *  vf32x and return the resulting vector float values.
+/** \brief Copy the sign bit from vf32x merged with magnitude from
+ *  vf32y and return the resulting vector float values.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as a bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference. This implementation is independent of the
+ *  compilers' update status.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 6-7   | 2/cycle  |
  *  |power9   | 2     | 2/cycle  |
  *
- *  @param vf32x vector float values containing the magnitudes.
- *  @param vf32y vector float values containing the sign bits.
- *  @return vector float values with magnitude from vf32x and the
- *  sign of vf32y.
+ *  @param vf32x vector float values containing the sign bits.
+ *  @param vf32y vector float values containing the magnitudes.
+ *  @return vector float values with magnitude from vf32y and the
+ *  sign of vf32x.
  */
 static inline vf32_t
 vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
 {
 #if _ARCH_PWR7
-  /* P9 has a 2 cycle xvcpsgnsp and eliminates a const load. */
+#ifdef PVECLIB_CPSGN_FIXED
   return (vec_cpsgn (vf32x, vf32y));
+#else
+  vf32_t result;
+  __asm__(
+      "xvcpsgnsp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf32x), "wa" (vf32y)
+      :);
+  return (result);
+#endif
 #else
   const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
       0x80000000, 0x80000000);
   vf32_t result;
 
-  result = (vf32_t)vec_sel ((vui32_t)vf32x, (vui32_t)vf32y, signmask);
+  result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
   return (result);
 #endif
 }

diff --git a/src/pveclib/vec_f64_ppc.h b/src/pveclib/vec_f64_ppc.h
@@ -773,31 +773,53 @@ vec_any_iszerof64 (vf64_t vf64)
 #endif
 }
 
-/** \brief Copy the sign bit from vf64y merged with magnitude from
- *  vf64x and return the resulting vector double values.
+/** \brief Copy the sign bit from vf64x merged with magnitude from
+ *  vf64y and return the resulting vector double values.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as a bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference. This implementation is independent of the
+ *  compilers' update status.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 6-7   | 2/cycle  |
  *  |power9   | 2     | 2/cycle  |
  *
- *  @param vf64x vector double values containing the magnitudes.
- *  @param vf64y vector double values containing the sign bits.
- *  @return vector double values with magnitude from vf64x and the
- *  sign of vf64y.
+ *  @param vf64x vector double values containing the sign bits.
+ *  @param vf64y vector double values containing the magnitudes.
+ *  @return vector double values with magnitude from vf64y and the
+ *  sign of vf64x.
  */
 static inline vf64_t
-vec_copysignf64 (vf64_t vf64x , vf64_t vf64y)
+vec_copysignf64 (vf64_t vf64x, vf64_t vf64y)
 {
 #if _ARCH_PWR7
   /* P9 has a 2 cycle xvcpsgndp and eliminates a const load. */
-	return (vec_cpsgn (vf64x, vf64y));
+#ifdef PVECLIB_CPSGN_FIXED
+  return (vec_cpsgn (vf64x, vf64y));
+#else
+  vf64_t result;
+  __asm__(
+      "xvcpsgndp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf64x), "wa" (vf64y)
+      :);
+  return (result);
+#endif
 #else
-	const vui32_t signmask  = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
-	vf64_t result;
+  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
+  vf64_t result;
 
-	result  = (vf64_t)vec_sel ((vui32_t)vf64x, (vui32_t)vf64y, signmask);
-	return (result);
+  result = (vf64_t) vec_sel ((vui32_t) vf64y, (vui32_t) vf64x, signmask);
+  return (result);
 #endif
 }
 

diff --git a/src/testsuite/arith128_test_f32.c b/src/testsuite/arith128_test_f32.c
@@ -1186,7 +1186,7 @@ test_float_cpsgn (void)
 
   i = (vf32_t) { 0.0, -0.0, 0.0, -0.0 };
   j = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
-  e = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
+  e = (vf32_t) { 0.0, -0.0, 0.0, -0.0 };
   k = vec_copysignf32 (i, j);
 
 #ifdef __DEBUG_PRINT__
@@ -1196,9 +1196,9 @@ test_float_cpsgn (void)
 #endif
   rc += check_v4f32x ("vec_copysignf32 1:", k, e);
 
-  i = (vf32_t) { __FLT_MAX__, __FLT_MIN__, __FLT_EPSILON__,
+  i = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
+  j = (vf32_t) { __FLT_MAX__, __FLT_MIN__, __FLT_EPSILON__,
 		  __FLT_DENORM_MIN__ };
-  j = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
   e = (vf32_t) { -(__FLT_MAX__), __FLT_MIN__, -(__FLT_EPSILON__),
 		  __FLT_DENORM_MIN__ };
   k = vec_copysignf32 (i, j);
@@ -1210,9 +1210,9 @@ test_float_cpsgn (void)
 #endif
   rc += check_v4f32x ("vec_copysignf32 2:", k, e);
 
-  i = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
+  i = (vf32_t) CONST_VINT32_W(0.0, -0.0, 0.0, -0.0);
+  j = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
 			       __FLOAT_NINF);
-  j = (vf32_t) CONST_VINT32_W(0.0, -0.0, 0.0, -0.0);
   e = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
 			       __FLOAT_NINF);
   k = vec_copysignf32 (i, j);
@@ -1224,9 +1224,9 @@ test_float_cpsgn (void)
 #endif
   rc += check_v4f32x ("vec_copysignf32 3:", k, e);
 
-  i = (vf32_t) CONST_VINT128_W(__FLOAT_NAN, __FLOAT_NNAN, __FLOAT_NSNAN,
+  i = (vf32_t) {-0.0, 0.0, 0.0, -0.0 };
+  j = (vf32_t) CONST_VINT128_W(__FLOAT_NAN, __FLOAT_NNAN, __FLOAT_NSNAN,
 			       __FLOAT_SNAN);
-  j = (vf32_t) {-0.0, 0.0, 0.0, -0.0 };
   e = (vf32_t) CONST_VINT128_W(__FLOAT_NNAN, __FLOAT_NAN, __FLOAT_SNAN,
 			       __FLOAT_NSNAN);
   k = vec_copysignf32 (i, j);

diff --git a/src/testsuite/arith128_test_f64.c b/src/testsuite/arith128_test_f64.c
@@ -1596,7 +1596,7 @@ test_double_cpsgn (void)
 
   i = (vf64_t) { 0.0, -0.0 };
   j = (vf64_t) { -0.0, 0.0 };
-  e = (vf64_t) { -0.0, 0.0 };
+  e = (vf64_t) { 0.0, -0.0 };
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
@@ -1606,8 +1606,8 @@ test_double_cpsgn (void)
 #endif
   rc += check_v2f64x ("vec_copysignf64 1:", k, e);
 
-  i = (vf64_t) { __DBL_MAX__, __DBL_MIN__ };
-  j = (vf64_t) { -0.0, 0.0 };
+  i = (vf64_t) { -0.0, 0.0 };
+  j = (vf64_t) { __DBL_MAX__, __DBL_MIN__ };
   e = (vf64_t) { -(__DBL_MAX__), __DBL_MIN__ };
   k = vec_copysignf64 (i, j);
 
@@ -1618,8 +1618,8 @@ test_double_cpsgn (void)
 #endif
   rc += check_v2f64x ("vec_copysignf64 2:", k, e);
 
-  i = (vf64_t) { __DBL_EPSILON__, __DBL_DENORM_MIN__ };
-  j = (vf64_t) { -0.0, 0.0 };
+  i = (vf64_t) { -0.0, 0.0 };
+  j = (vf64_t) { __DBL_EPSILON__, __DBL_DENORM_MIN__ };
   e = (vf64_t) { -(__DBL_EPSILON__), __DBL_DENORM_MIN__ };
   k = vec_copysignf64 (i, j);
 
@@ -1630,8 +1630,8 @@ test_double_cpsgn (void)
 #endif
   rc += check_v2f64x ("vec_copysignf64 3:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
-  j = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  i = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   k = vec_copysignf64 (i, j);
 
@@ -1642,8 +1642,8 @@ test_double_cpsgn (void)
 #endif
   rc += check_v2f64x ("vec_copysignf64 4:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
-  j = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  i = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   k = vec_copysignf64 (i, j);
 
@@ -1654,8 +1654,8 @@ test_double_cpsgn (void)
 #endif
   rc += check_v2f64x ("vec_copysignf64 5:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_NAN, __DOUBLE_NNAN);
-  j = (vf64_t) CONST_VINT64_DW( -0.0, 0.0 );
+  i = (vf64_t) CONST_VINT64_DW( -0.0, 0.0 );
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_NAN, __DOUBLE_NNAN);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_NNAN, __DOUBLE_NAN);
   k = vec_copysignf64 (i, j);
 
@@ -1666,8 +1666,8 @@ test_double_cpsgn (void)
 #endif
   rc += check_v2f64x ("vec_copysignf64 6:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_NSNAN, __DOUBLE_SNAN);
-  j = (vf64_t) CONST_VINT64_DW ( 0.0, -0.0 );
+  i = (vf64_t) CONST_VINT64_DW ( 0.0, -0.0 );
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_NSNAN, __DOUBLE_SNAN);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_SNAN, __DOUBLE_NSNAN);
   k = vec_copysignf64 (i, j);
 

diff --git a/src/testsuite/arith128_test_i128.c b/src/testsuite/arith128_test_i128.c
@@ -2308,7 +2308,8 @@ test_msumudm (void)
 int
 test_muludq (void)
 {
-  vui32_t i, j, k, l /*, m*/;
+  vui32_t i, j, k/*, l , m*/;
+  vui128_t l;
   vui32_t e, ec;
   int rc = 0;
 
@@ -2383,7 +2384,8 @@ test_muludq (void)
 int
 test_madduq (void)
 {
-  vui32_t i, j, k, l, m, n;
+  vui32_t i, j, k, m, n;
+  vui128_t l;
   vui32_t e, ec;
   int rc = 0;