Fix use of inline ASM labels (#65)

These labels need to be local; otherwise, they are retained in the compiled static library. At link time, some linkers (notably the macOS linker) interpret them as new functions that can be safely stripped from the linked binary, since the labels are unreferenced. This goes wrong on macOS when you pass the `-dead_strip` linker flag, [which `rustc` does](https://github.com/rust-lang/rust/blob/a9baa16482ba4100df529ccba39c787f27ad0475/compiler/rustc_codegen_ssa/src/back/linker.rs#L552).
awslabs · Jun 5, 2023 · a5b0e7f · a5b0e7f
1 parent f8008b3
commit a5b0e7f
Showing 1 changed file with 21 additions and 19 deletions.
diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c
@@ -7,22 +7,30 @@
 
 #include <aws/common/cpuid.h>
 
+/* clang-format off */
+
 /* this implementation is only for the x86_64 intel architecture */
 #if defined(__x86_64__)
 #    if defined(__clang__)
 #        pragma clang diagnostic push
 #        pragma clang diagnostic ignored "-Wdollar-in-identifier-extension"
 #    endif
 
+/* use local labels, so that linker doesn't think these are functions it can deadstrip */
+#    ifdef __APPLE__
+#        define LABEL(label) "L_" #label "_%="
+#    else
+#        define LABEL(label) ".L_" #label "_%="
+#    endif
+
 /*
  * Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using
  * the specified Magic Constants K1 and K2.
  * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding
  * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11
  * Result is placed in ecx
  */
-#    define FOLD_K1K2(NAME, K1, K2)                                                                                    \
-        "fold_k1k2_" #NAME "_%=: \n"                                                                                   \
+#    define FOLD_K1K2(K1, K2)                                                                                          \
         "movl             " #K1 ", %%r8d    # Magic K1 constant \n"                                                    \
         "movl             " #K2 ", %%r9d    # Magic K2 constant \n"                                                    \
         "movq              %%rcx, %%xmm1   # crc0 into lower dword of xmm1 \n"                                         \
@@ -50,8 +58,6 @@
  */
 static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
     __asm__ __volatile__(
-        "enter_256_%=:"
-
         "xor          %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
         "xor          %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"
 
@@ -98,7 +104,7 @@ static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t c
         "crc32q   80(%[in]), %%rcx    # crc0 \n"
         "crc32q  168(%[in]), %%r11    # crc2 \n"
 
-        FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */
+        FOLD_K1K2($0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */
 
         /* output registers
          [crc] is an input and an output so it is marked read/write (i.e. "+c")*/
@@ -121,14 +127,12 @@ static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t c
  */
 static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
     __asm__ __volatile__(
-        "enter_1024_%=:"
-
         "xor          %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
         "xor          %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"
 
         "movl            $5, %%r8d    # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"
 
-        "loop_1024_%=:"
+        LABEL(loop_1024) ": \n"
 
         "prefetcht0  128(%[in])       # \n"
         "prefetcht0  472(%[in])       # \n"
@@ -168,7 +172,7 @@ static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t
 
         "add            $64, %[in]    # \n"
         "sub             $1, %%r8d    # \n"
-        "jnz loop_1024_%=             # \n"
+        "jnz " LABEL(loop_1024) "     # \n"
 
         "crc32q    0(%[in]), %%rcx    # crc0 \n"
         "crc32q  344(%[in]), %%r11    # crc1 \n"
@@ -181,7 +185,7 @@ static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t
         "crc32q   16(%[in]), %%rcx    # crc0 \n"
         "crc32q  696(%[in]), %%r10    # crc2 \n"
 
-        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx
+        FOLD_K1K2($0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx
 
                             output registers
                             [crc] is an input and an output so it is marked read/write (i.e. "+c")
@@ -205,14 +209,12 @@ static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t
  */
 static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
     __asm__ __volatile__(
-        "enter_3072_%=:"
-
         "xor          %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
         "xor          %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"
 
         "movl           $16, %%r8d    # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"
 
-        "loop_3072_%=:"
+        LABEL(loop_3072) ": \n"
 
         "prefetcht0  128(%[in])       # \n"
         "prefetcht0 1152(%[in])       # \n"
@@ -252,10 +254,9 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t
 
         "add            $64, %[in]    # \n"
         "sub             $1, %%r8d    # \n"
-        "jnz loop_3072_%=             # \n"
+        "jnz " LABEL(loop_3072) "     # \n"
 
         FOLD_K1K2(
-            3072,
             $0xa51b6135,
             $0x170076fa) /* Magic Constants used to fold crc stripes into ecx
 
@@ -297,7 +298,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
     /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
     if (AWS_UNLIKELY(length < 8)) {
         while (length-- > 0) {
-            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
+            __asm__("CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
             input++;
         }
         return ~crc;
@@ -314,7 +315,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
 
     /* spin through the leading unaligned input bytes (if any) one-by-one */
     while (leading-- > 0) {
-        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
+        __asm__("CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
         input++;
     }
 
@@ -344,14 +345,14 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
     /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
     while (AWS_LIKELY(length >= 8)) {
         /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */
-        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : [ crc ] "+c"(crc) : [ in ] "r"(input));
+        __asm__ __volatile__("CRC32Q (%[in]), %%rcx" : [ crc ] "+c"(crc) : [ in ] "r"(input));
         input += 8;
         length -= 8;
     }
 
     /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
     while (length-- > 0) {
-        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
+        __asm__ __volatile__("CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
         input++;
     }
 
@@ -375,3 +376,4 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
 }
 
 #endif
+/* clang-format on */