Skip to content

Commit

Permalink
Fix use of inline ASM labels (#65)
Browse files Browse the repository at this point in the history
These labels need to be local, otherwise they're retained in the compiled static library. At link time, some linkers (notably macOS) interpret them as new functions that can be safely stripped from the linked binary since the labels are unreferenced.

This goes wrong on macOS when you pass the `-dead_strip` linker flag, [which `rustc` does](https://github.com/rust-lang/rust/blob/a9baa16482ba4100df529ccba39c787f27ad0475/compiler/rustc_codegen_ssa/src/back/linker.rs#L552).
  • Loading branch information
jamesbornholt authored Jun 5, 2023
1 parent f8008b3 commit a5b0e7f
Showing 1 changed file with 21 additions and 19 deletions.
40 changes: 21 additions & 19 deletions source/intel/asm/crc32c_sse42_asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,30 @@

#include <aws/common/cpuid.h>

/* clang-format off */

/* this implementation is only for the x86_64 intel architecture */
#if defined(__x86_64__)
# if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdollar-in-identifier-extension"
# endif

/* use local labels, so that linker doesn't think these are functions it can deadstrip */
# ifdef __APPLE__
# define LABEL(label) "L_" #label "_%="
# else
# define LABEL(label) ".L_" #label "_%="
# endif

/*
* Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using
* the specified Magic Constants K1 and K2.
* Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding
* Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11
* Result is placed in ecx
*/
# define FOLD_K1K2(NAME, K1, K2) \
"fold_k1k2_" #NAME "_%=: \n" \
# define FOLD_K1K2(K1, K2) \
"movl " #K1 ", %%r8d # Magic K1 constant \n" \
"movl " #K2 ", %%r9d # Magic K2 constant \n" \
"movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \
Expand Down Expand Up @@ -50,8 +58,6 @@
*/
static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
__asm__ __volatile__(
"enter_256_%=:"

"xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
"xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

Expand Down Expand Up @@ -98,7 +104,7 @@ static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t c
"crc32q 80(%[in]), %%rcx # crc0 \n"
"crc32q 168(%[in]), %%r11 # crc2 \n"

FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */
FOLD_K1K2($0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */

/* output registers
[crc] is an input and and output so it is marked read/write (i.e. "+c")*/
Expand All @@ -121,14 +127,12 @@ static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t c
*/
static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
__asm__ __volatile__(
"enter_1024_%=:"

"xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
"xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

"movl $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"

"loop_1024_%=:"
LABEL(loop_1024) ": \n"

"prefetcht0 128(%[in]) # \n"
"prefetcht0 472(%[in]) # \n"
Expand Down Expand Up @@ -168,7 +172,7 @@ static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t

"add $64, %[in] # \n"
"sub $1, %%r8d # \n"
"jnz loop_1024_%= # \n"
"jnz " LABEL(loop_1024) " # \n"

"crc32q 0(%[in]), %%rcx # crc0 \n"
"crc32q 344(%[in]), %%r11 # crc1 \n"
Expand All @@ -181,7 +185,7 @@ static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t
"crc32q 16(%[in]), %%rcx # crc0 \n"
"crc32q 696(%[in]), %%r10 # crc2 \n"

FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx
FOLD_K1K2($0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx
output registers
[crc] is an input and and output so it is marked read/write (i.e. "+c")
Expand All @@ -205,14 +209,12 @@ static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t
*/
static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
__asm__ __volatile__(
"enter_3072_%=:"

"xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
"xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

"movl $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"

"loop_3072_%=:"
LABEL(loop_3072) ": \n"

"prefetcht0 128(%[in]) # \n"
"prefetcht0 1152(%[in]) # \n"
Expand Down Expand Up @@ -252,10 +254,9 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t

"add $64, %[in] # \n"
"sub $1, %%r8d # \n"
"jnz loop_3072_%= # \n"
"jnz " LABEL(loop_3072) " # \n"

FOLD_K1K2(
3072,
$0xa51b6135,
$0x170076fa) /* Magic Constants used to fold crc stripes into ecx
Expand Down Expand Up @@ -297,7 +298,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (AWS_UNLIKELY(length < 8)) {
while (length-- > 0) {
__asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
__asm__("CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
input++;
}
return ~crc;
Expand All @@ -314,7 +315,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev

/* spin through the leading unaligned input bytes (if any) one-by-one */
while (leading-- > 0) {
__asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
__asm__("CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
input++;
}

Expand Down Expand Up @@ -344,14 +345,14 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
while (AWS_LIKELY(length >= 8)) {
/* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */
__asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : [ crc ] "+c"(crc) : [ in ] "r"(input));
__asm__ __volatile__("CRC32Q (%[in]), %%rcx" : [ crc ] "+c"(crc) : [ in ] "r"(input));
input += 8;
length -= 8;
}

/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
while (length-- > 0) {
__asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
__asm__ __volatile__("CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
input++;
}

Expand All @@ -375,3 +376,4 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
}

#endif
/* clang-format on */

0 comments on commit a5b0e7f

Please sign in to comment.