From 4602f9aab6290b74c1db182876868eea6ac3d484 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= <albin.ahlback@gmail.com>
Date: Tue, 14 May 2024 01:05:57 +0200
Subject: [PATCH] Optimize n_revbin for Arm64

And remove byte_swap from longlong.h
---
 doc/source/longlong.rst   |  8 -----
 src/longlong.h            | 28 ----------------
 src/longlong_msc_arm64.h  |  3 --
 src/longlong_msc_x86.h    |  6 ----
 src/test/main.c           |  2 --
 src/test/t-byte_swap.c    | 68 ---------------------------------------
 src/ulong_extras/revbin.c | 61 +++++++++++++++++++++++++++++++----
 7 files changed, 54 insertions(+), 122 deletions(-)
 delete mode 100644 src/test/t-byte_swap.c

diff --git a/doc/source/longlong.rst b/doc/source/longlong.rst
index e4b84253f2..3c6acd21cb 100644
--- a/doc/source/longlong.rst
+++ b/doc/source/longlong.rst
@@ -79,11 +79,3 @@ Division
 
     Works like ``udiv_qrnnd``, but takes a precomputed inverse ``di`` as 
     computed by ::func::`n_preinvert_limb`.
-
-Miscellaneous
--------------------------------------------------------------------------------
-
-.. macro:: byte_swap(x)
-
-    Swap the order of the bytes in the word `x`, i.e. most significant byte
-    becomes least significant byte, etc.
diff --git a/src/longlong.h b/src/longlong.h
index 285aec8b35..3fc7808923 100644
--- a/src/longlong.h
+++ b/src/longlong.h
@@ -34,11 +34,6 @@ extern "C" {
 #  define flint_ctz __builtin_ctzl
 # endif
 
-/* Byte swap */
-# define _FLINT_CAT_(X,Y) X##Y
-# define _FLINT_CAT(X,Y) _FLINT_CAT_(X,Y)
-# define byte_swap(x) do { (x) = _FLINT_CAT(__builtin_bswap, FLINT_BITS)(x); } while (0)
-
 /* Addition, subtraction and multiplication */
 # if defined(__clang__)
 #  include "longlong_asm_clang.h"
@@ -97,29 +92,6 @@ static inline int flint_ctz(ulong x)
 }
 #endif
 
-/* Byte swap */
-#if !defined(byte_swap)
-# if FLINT_BITS == 32
-#  define byte_swap(n) \
-  do { \
-      /* swap adjacent bytes */ \
-      (n) = ((((n) & 0xff00ff00) >> 8) | (((n) & 0x00ff00ff) << 8)); \
-      /* swap adjacent words */ \
-      (n) = (((n) >> 16) | ((n) << 16)); \
-  } while (0)
-# else
-#  define byte_swap(n) \
-  do { \
-      /* swap adjacent bytes */ \
-      (n) = ((((n) & 0xff00ff00ff00ff00) >> 8) | (((n) & 0x00ff00ff00ff00ff) << 8)); \
-      /* swap adjacent words */ \
-      (n) = ((((n) & 0xffff0000ffff0000) >> 16) | (((n) & 0x0000ffff0000ffff) << 16)); \
-      /* swap adjacent double words */ \
-      (n) = (((n) >> 32) | ((n) << 32)); \
-  } while (0)
-# endif
-#endif
-
 /* Addition and subtraction */
 #if !defined(add_ssaaaa)
 # define add_ssaaaa(s1, s0, a1, a0, b1, b0) \
diff --git a/src/longlong_msc_arm64.h b/src/longlong_msc_arm64.h
index 045a8d96f4..2147484279 100644
--- a/src/longlong_msc_arm64.h
+++ b/src/longlong_msc_arm64.h
@@ -26,9 +26,6 @@ static inline int flint_ctz(ulong x)
     return index;
 }
 
-/* Byte swap */
-# define byte_swap(x) do { (x) = _byteswap_uint64(x); } while (0)
-
 /* Multiplication */
 #define umul_ppmm(r1, r0, u, v) \
 do \
diff --git a/src/longlong_msc_x86.h b/src/longlong_msc_x86.h
index 9a718b1533..79aa353b64 100644
--- a/src/longlong_msc_x86.h
+++ b/src/longlong_msc_x86.h
@@ -22,9 +22,6 @@
 # define flint_clz _lzcnt_u32
 # define flint_ctz _tzcnt_u32
 
-/* Byte swap */
-# define byte_swap(x) do { (x) = _byteswap_ulong(x); } while (0)
-
 /* Addition and subtraction */
 # define _FLINT_ADC _addcarry_u32
 # define _FLINT_SBB _subborrow_u32
@@ -56,9 +53,6 @@ do \
 # define flint_clz _lzcnt_u64
 # define flint_ctz _tzcnt_u64
 
-/* Byte swap */
-# define byte_swap(x) do { (x) = _byteswap_uint64(x); } while (0)
-
 /* Addition and subtraction */
 # define _FLINT_ADC _addcarry_u64
 # define _FLINT_SBB _subborrow_u64
diff --git a/src/test/main.c b/src/test/main.c
index b4e844489c..0243785ac4 100644
--- a/src/test/main.c
+++ b/src/test/main.c
@@ -14,7 +14,6 @@
 #include "t-add_ssaaaa.c"
 #include "t-add_sssaaaaaa.c"
 #include "t-add_ssssaaaaaaaa.c"
-#include "t-byte_swap.c"
 #include "t-flint_clz.c"
 #include "t-flint_ctz.c"
 #include "t-io.c"
@@ -34,7 +33,6 @@ test_struct tests[] =
     TEST_FUNCTION(add_ssaaaa),
     TEST_FUNCTION(add_sssaaaaaa),
     TEST_FUNCTION(add_ssssaaaaaaaa),
-    TEST_FUNCTION(byte_swap),
     TEST_FUNCTION(flint_clz),
     TEST_FUNCTION(flint_ctz),
     TEST_FUNCTION(flint_fprintf),
diff --git a/src/test/t-byte_swap.c b/src/test/t-byte_swap.c
deleted file mode 100644
index 6d26fd8b18..0000000000
--- a/src/test/t-byte_swap.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
-    Copyright (C) 2015 William Hart
-
-    This file is part of FLINT.
-
-    FLINT is free software: you can redistribute it and/or modify it under
-    the terms of the GNU Lesser General Public License (LGPL) as published
-    by the Free Software Foundation; either version 3 of the License, or
-    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
-*/
-
-#include "ulong_extras.h"
-#include "test_helpers.h"
-
-ulong byte_swap_naive(ulong n)
-{
-    ulong r = 0;
-    slong i;
-
-    for (i = 0; i < sizeof(ulong); i++)
-    {
-        r <<= 8;
-        r |= (n & 0xFF);
-        n >>= 8;
-    }
-
-    return r;
-}
-
-TEST_FUNCTION_START(byte_swap, state)
-{
-    int i, result;
-
-    for (i = 0; i < 10000 * flint_test_multiplier(); i++)
-    {
-        ulong n, r1, r2;
-        int cs;
-
-        n = n_randtest(state);
-        r1 = n;
-
-        cs = n_randint(state, 2);
-
-        if (cs == 0)
-        {
-            /* byte_swap(byte_swap(n)) == n */
-            r2 = n;
-            byte_swap(r2);
-            byte_swap(r2);
-        }
-        else
-        {
-            /* byte_swap(n) == byte_swap_naive(n) */
-            r1 = n;
-            byte_swap(r1);
-            r2 = byte_swap_naive(n);
-        }
-
-        result = (r1 == r2);
-        if (!result)
-            TEST_FUNCTION_FAIL(
-                    "case %d\n"
-                    "n = %wx, r1 = %wx, r2 = %wx\n",
-                    n, r1, r2);
-    }
-
-    TEST_FUNCTION_END(state);
-}
diff --git a/src/ulong_extras/revbin.c b/src/ulong_extras/revbin.c
index fbb15d14a2..45482a2202 100644
--- a/src/ulong_extras/revbin.c
+++ b/src/ulong_extras/revbin.c
@@ -1,5 +1,6 @@
 /*
     Copyright (C) 2009, 2015 William Hart
+    Copyright (C) 2024 Albin Ahlbäck
 
     This file is part of FLINT.
 
@@ -9,9 +10,23 @@
     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
 */
 
-#include "flint.h"
 #include "ulong_extras.h"
 
+#if defined(__GNUC__) && FLINT64 && defined(__aarch64__)
+# include <arm_acle.h>
+ulong
+n_revbin(ulong n, ulong b)
+{
+    FLINT_ASSERT(b <= FLINT_BITS);
+
+    /* b == 0 is handled apart: a shift by FLINT_BITS would be undefined */
+    if (b == 0)
+        return 0;
+
+    /* reverse the whole limb, then keep only the top b bits of the result */
+    return __rbitll(n) >> (FLINT_BITS - b);
+}
+#else
 static const unsigned char flint_revtab[] = {
     0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
     0x30, 0xB0, 0x70, 0xF0,
@@ -47,9 +62,42 @@ static const unsigned char flint_revtab[] = {
     0x3F, 0xBF, 0x7F, 0xFF
 };
 
-/*
-   Computes the reverse binary representation of a number of b bits.
-*/
+#if defined(__GNUC__) /* compilers defining __GNUC__: map to the bswap builtins */
+# if FLINT64
+#  define byte_swap __builtin_bswap64
+# else
+#  define byte_swap __builtin_bswap32
+# endif
+#elif defined(_MSC_VER) /* MSVC: map to the byte-swap intrinsics from <stdlib.h> */
+# include <stdlib.h>
+# if FLINT64
+#  define byte_swap _byteswap_uint64
+# else
+#  define byte_swap _byteswap_ulong
+# endif
+#else /* any other compiler: reverse the bytes of a limb with masks and shifts */
+# if FLINT64
+FLINT_FORCE_INLINE ulong byte_swap(ulong n)
+{
+    /* swap the two bytes inside every 16-bit group */
+    n = ((n & 0xff00ff00ff00ff00) >> 8) | ((n & 0x00ff00ff00ff00ff) << 8);
+    /* swap the two 16-bit groups inside every 32-bit half */
+    n = ((n & 0xffff0000ffff0000) >> 16) | ((n & 0x0000ffff0000ffff) << 16);
+    /* swap the two 32-bit halves, which completes the byte reversal */
+    n = (n >> 32) | (n << 32);
+    return n;
+}
+# else
+FLINT_FORCE_INLINE ulong byte_swap(ulong n)
+{
+    /* swap the two bytes inside every 16-bit group */
+    n = ((n & 0xff00ff00) >> 8) | ((n & 0x00ff00ff) << 8);
+    /* swap the two 16-bit halves, which completes the byte reversal */
+    n = (n >> 16) | (n << 16);
+    return n;
+}
+# endif
+#endif
 
 ulong
 n_revbin(ulong n, ulong b)
@@ -83,8 +131,7 @@ n_revbin(ulong n, ulong b)
         n = (((n & 0xf0f0f0f0) >> 4) | ((n & 0x0f0f0f0f) << 4));
 #endif
 
-        byte_swap(n);
-
-        return n >> (FLINT_BITS - b);
+        return byte_swap(n) >> (FLINT_BITS - b);
     }
 }
+#endif