From d67c3aeca643a1527ac1924f0707701f8706dc6c Mon Sep 17 00:00:00 2001 From: Christopher Chang Date: Thu, 28 Sep 2023 11:33:18 -0700 Subject: [PATCH] libdeflate 1.19, --clump test --- 2.0/Tests/TEST_PHASED_VCF/glm_compare.py | 27 +- 2.0/Tests/TEST_PHASED_VCF/run_tests.sh | 24 +- 2.0/libdeflate/common_defs.h | 156 +++- 2.0/libdeflate/lib/adler32.c | 7 +- 2.0/libdeflate/lib/arm/adler32_impl.h | 132 +-- 2.0/libdeflate/lib/arm/arm_cpu_features.c | 28 +- 2.0/libdeflate/lib/arm/cpu_features.h | 155 ++-- 2.0/libdeflate/lib/arm/crc32_impl.h | 142 +-- 2.0/libdeflate/lib/arm/crc32_pmull_helpers.h | 58 +- 2.0/libdeflate/lib/arm/crc32_pmull_wide.h | 34 +- 2.0/libdeflate/lib/arm/matchfinder_impl.h | 13 +- 2.0/libdeflate/lib/bt_matchfinder.h | 5 +- 2.0/libdeflate/lib/cpu_features_common.h | 6 +- 2.0/libdeflate/lib/crc32.c | 7 +- 2.0/libdeflate/lib/crc32_table.h | 526 ------------ 2.0/libdeflate/lib/crc32_vec_template.h | 61 -- 2.0/libdeflate/lib/decompress_template.h | 13 +- 2.0/libdeflate/lib/deflate_compress.c | 806 ++++++++++++------ 2.0/libdeflate/lib/deflate_decompress.c | 114 ++- 2.0/libdeflate/lib/gzip_compress.c | 6 +- 2.0/libdeflate/lib/gzip_decompress.c | 6 +- 2.0/libdeflate/lib/hc_matchfinder.h | 5 +- 2.0/libdeflate/lib/ht_matchfinder.h | 4 +- 2.0/libdeflate/lib/lib_common.h | 41 +- 2.0/libdeflate/lib/matchfinder_common.h | 4 +- 2.0/libdeflate/lib/unaligned.h | 228 ----- 2.0/libdeflate/lib/utils.c | 38 +- 2.0/libdeflate/lib/x86/adler32_impl.h | 171 ++-- 2.0/libdeflate/lib/x86/cpu_features.h | 107 ++- 2.0/libdeflate/lib/x86/crc32_impl.h | 11 +- .../lib/x86/crc32_pclmul_template.h | 69 +- 2.0/libdeflate/lib/x86/decompress_impl.h | 11 +- 2.0/libdeflate/lib/x86/x86_cpu_features.c | 117 ++- 2.0/libdeflate/lib/zlib_compress.c | 6 +- 2.0/libdeflate/lib/zlib_decompress.c | 6 +- 2.0/libdeflate/libdeflate.h | 203 +++-- 2.0/plink2.cc | 2 +- 37 files changed, 1583 insertions(+), 1766 deletions(-) delete mode 100644 2.0/libdeflate/lib/crc32_table.h delete mode 100644 2.0/libdeflate/lib/crc32_vec_template.h delete mode 100644 2.0/libdeflate/lib/unaligned.h diff --git a/2.0/Tests/TEST_PHASED_VCF/glm_compare.py b/2.0/Tests/TEST_PHASED_VCF/glm_compare.py index 4c9a32e5a..84ce1857d 100755 --- a/2.0/Tests/TEST_PHASED_VCF/glm_compare.py +++ b/2.0/Tests/TEST_PHASED_VCF/glm_compare.py @@ -61,6 +61,7 @@ def main(): a1_col2 = first_line2.index('A1') test_col2 = first_line2.index('TEST') obsct_col2 = first_line2.index('OBS_CT') + errcode_col2 = first_line2.index('ERRCODE') betaor_col2 = -1 stat_col2 = -1 is_odds_ratio = False @@ -102,16 +103,24 @@ def main(): row1[5] != row2[obsct_col2]: eprint('Header column mismatch between association files.') sys.exit(1) - if row1[6] != 'NA' and row2[betaor_col2] != 'NA': - val1 = float(row1[6]) - val2 = float(row2[betaor_col2]) - if is_odds_ratio: - # this is a more appropriate scale - val1 = math.log(val1) - val2 = math.log(val2) - if not float_compare_ok(val1, val2, tol): - eprint('BETA/OR mismatch.') + if row1[6] != 'NA': + # Apple clang 15.0.0 apparent-miscompilation issue is likely to + # manifest as an inappropriate ERRCODE=INVALID_RESULT. This is + # very unlikely to happen on random data, so error out and then + # manually pass if necessary. 
+ if row2[errcode_col2] == 'INVALID_RESULT': + eprint('Unexpected INVALID_RESULT.') sys.exit(1) + if row2[betaor_col2] != 'NA': + val1 = float(row1[6]) + val2 = float(row2[betaor_col2]) + if is_odds_ratio: + # this is a more appropriate scale + val1 = math.log(val1) + val2 = math.log(val2) + if not float_compare_ok(val1, val2, tol): + eprint('BETA/OR mismatch.') + sys.exit(1) if row1[7] != 'NA' and row1[7] != 'inf' and row2[stat_col2] != 'NA': val1 = float(row1[7]) val2 = float(row2[stat_col2]) diff --git a/2.0/Tests/TEST_PHASED_VCF/run_tests.sh b/2.0/Tests/TEST_PHASED_VCF/run_tests.sh index 0d69657f0..ca03ad72b 100755 --- a/2.0/Tests/TEST_PHASED_VCF/run_tests.sh +++ b/2.0/Tests/TEST_PHASED_VCF/run_tests.sh @@ -91,7 +91,7 @@ diff -q plink2_pca.eigenvec plink2_pca_rfreq.eigenvec $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pca 3 approx biallelic-var-wts --out plink2_pca_approx python3 pca_compare.py -1 plink1_pca -2 plink2_pca_approx -t 0.009 -# Test --glm. +# Test --glm and --clump. # Generate random binary and quantitative phenotypes for 1kg_phase3_chr21 # samples, and verify regression results are consistent with plink 1.9 within a # tolerance (tbd: see if we can generate new phenotypes each time, or if it's @@ -120,9 +120,23 @@ $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm allow- python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm.PHENO1.glm.logistic -t 0.1 $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm allow-no-covars no-firth --out plink2_glm_dbl python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm_dbl.PHENO1.glm.logistic -t 0.3 +plink --bfile plink1_data --clump plink2_glm_dbl.PHENO1.glm.logistic --clump-snp-field ID --clump-p1 0.1 --clump-p2 0.2 --out plink1_test +# Extract SNP, TOTAL, NSIG, S05, S01, S001, S0001, and SP2 columns. +# ($12 != "") check needed because plink 1.x puts two blank lines at the end of +# the .clumped file. 
+cat plink1_test.clumped | tail -n +2 | awk '{if ($12 == "NONE") $12 = "."; if ($12 != "") print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink1_test.clump_compare +plink2 --bfile plink1_data --clump cols=+f plink2_glm_dbl.PHENO1.glm.logistic --clump-p1 0.1 --clump-p2 0.2 --out plink2_test +cat plink2_test.clumps | tail -n +2 | awk '{print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink2_test.clump_compare +diff -q plink1_test.clump_compare plink2_test.clump_compare + plink --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --linear --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm allow-no-covars --out plink2_glm python3 glm_compare.py -1 plink1_glm.assoc.linear -2 plink2_glm.PHENO1.glm.linear -t 0.1 +plink --bfile plink1_data --clump plink2_glm.PHENO1.glm.linear --clump-snp-field ID --clump-p1 0.1 --clump-p2 0.2 --out plink1_test +cat plink1_test.clumped | tail -n +2 | awk '{if ($12 == "NONE") $12 = "."; if ($12 != "") print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink1_test.clump_compare +plink2 --bfile plink1_data --clump cols=+f plink2_glm.PHENO1.glm.linear --clump-p1 0.1 --clump-p2 0.2 --out plink2_test +cat plink2_test.clumps | tail -n +2 | awk '{print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink2_test.clump_compare +diff -q plink1_test.clump_compare plink2_test.clump_compare plink --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --logistic genotypic --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm allow-no-covars no-firth single-prec-cc genotypic --out plink2_glm @@ -132,6 +146,7 @@ python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm_dbl.PHENO1.glm plink --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --linear genotypic --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm allow-no-covars genotypic --out plink2_glm python3 glm_compare.py -1 plink1_glm.assoc.linear -2 plink2_glm.PHENO1.glm.linear -t 0.1 +# don't test --clump here since plink 1.x ignores TEST column plink --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --logistic --covar plink1_pca.eigenvec --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm no-firth single-prec-cc --covar plink2_pca.eigenvec --out plink2_glm @@ -141,6 +156,13 @@ python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm_dbl.PHENO1.glm plink --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --linear --covar plink1_pca.eigenvec --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm --covar plink2_pca.eigenvec --out plink2_glm python3 glm_compare.py -1 plink1_glm.assoc.linear -2 plink2_glm.PHENO1.glm.linear -t 0.1 +# hide-covar allows --clump test here +$1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm hide-covar --covar plink2_pca.eigenvec --out plink2_glm +plink --bfile plink1_data --clump plink2_glm.PHENO1.glm.linear --clump-snp-field ID --clump-p1 0.1 --clump-p2 0.2 --out plink1_test +cat plink1_test.clumped | tail -n +2 | awk '{if ($12 == "NONE") $12 = "."; if ($12 != "") print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink1_test.clump_compare +plink2 --bfile plink1_data --clump cols=+f plink2_glm.PHENO1.glm.linear --clump-p1 0.1 --clump-p2 0.2 --out plink2_test +cat plink2_test.clumps | tail -n +2 | awk '{print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > 
plink2_test.clump_compare +diff -q plink1_test.clump_compare plink2_test.clump_compare plink --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --logistic genotypic --covar plink1_pca.eigenvec --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm no-firth single-prec-cc genotypic --covar plink2_pca.eigenvec --out plink2_glm diff --git a/2.0/libdeflate/common_defs.h b/2.0/libdeflate/common_defs.h index cfe6fd62b..e1bc3fe09 100644 --- a/2.0/libdeflate/common_defs.h +++ b/2.0/libdeflate/common_defs.h @@ -28,16 +28,65 @@ #ifndef COMMON_DEFS_H #define COMMON_DEFS_H +#include "libdeflate.h" + #include #include /* for size_t */ #include #ifdef _MSC_VER +# include /* for _BitScan*() and other intrinsics */ # include /* for _byteswap_*() */ + /* Disable MSVC warnings that are expected. */ + /* /W2 */ +# pragma warning(disable : 4146) /* unary minus on unsigned type */ + /* /W3 */ +# pragma warning(disable : 4018) /* signed/unsigned mismatch */ +# pragma warning(disable : 4244) /* possible loss of data */ +# pragma warning(disable : 4267) /* possible loss of precision */ +# pragma warning(disable : 4310) /* cast truncates constant value */ + /* /W4 */ +# pragma warning(disable : 4100) /* unreferenced formal parameter */ +# pragma warning(disable : 4127) /* conditional expression is constant */ +# pragma warning(disable : 4189) /* local variable initialized but not referenced */ +# pragma warning(disable : 4232) /* nonstandard extension used */ +# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */ +# pragma warning(disable : 4295) /* array too small to include terminating null */ #endif #ifndef FREESTANDING # include /* for memcpy() */ #endif +/* ========================================================================== */ +/* Target architecture */ +/* ========================================================================== */ + +/* If possible, define a compiler-independent ARCH_* macro. 
*/ +#undef ARCH_X86_64 +#undef ARCH_X86_32 +#undef ARCH_ARM64 +#undef ARCH_ARM32 +#ifdef _MSC_VER +# if defined(_M_X64) +# define ARCH_X86_64 +# elif defined(_M_IX86) +# define ARCH_X86_32 +# elif defined(_M_ARM64) +# define ARCH_ARM64 +# elif defined(_M_ARM) +# define ARCH_ARM32 +# endif +#else +# if defined(__x86_64__) +# define ARCH_X86_64 +# elif defined(__i386__) +# define ARCH_X86_32 +# elif defined(__aarch64__) +# define ARCH_ARM64 +# elif defined(__arm__) +# define ARCH_ARM32 +# endif +#endif + /* ========================================================================== */ /* Type definitions */ /* ========================================================================== */ @@ -111,22 +160,13 @@ typedef size_t machine_word_t; # define __has_builtin(builtin) 0 #endif -/* LIBEXPORT - export a function from a shared library */ -#ifdef _WIN32 -# define LIBEXPORT __declspec(dllexport) -#elif defined(__GNUC__) -# define LIBEXPORT __attribute__((visibility("default"))) -#else -# define LIBEXPORT -#endif - /* inline - suggest that a function be inlined */ #ifdef _MSC_VER # define inline __inline #endif /* else assume 'inline' is usable as-is */ /* forceinline - force a function to be inlined, if possible */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_attribute(always_inline) # define forceinline inline __attribute__((always_inline)) #elif defined(_MSC_VER) # define forceinline __forceinline @@ -135,54 +175,71 @@ typedef size_t machine_word_t; #endif /* MAYBE_UNUSED - mark a function or variable as maybe unused */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_attribute(unused) # define MAYBE_UNUSED __attribute__((unused)) #else # define MAYBE_UNUSED #endif -/* restrict - hint that writes only occur through the given pointer */ -#ifdef __GNUC__ -# define restrict __restrict__ -#elif defined(_MSC_VER) - /* - * Don't use MSVC's __restrict; it has nonstandard behavior. - * Standard restrict is okay, if it is supported. - */ -# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) -# define restrict restrict +/* + * restrict - hint that writes only occur through the given pointer. + * + * Don't use MSVC's __restrict, since it has nonstandard behavior. + * Standard restrict is okay, if it is supported. 
+ */ +#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L) +# if defined(__GNUC__) || defined(__clang__) +# define restrict __restrict__ # else # define restrict # endif -#else -# define restrict -#endif +#endif /* else assume 'restrict' is usable as-is */ /* likely(expr) - hint that an expression is usually true */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) # define likely(expr) __builtin_expect(!!(expr), 1) #else # define likely(expr) (expr) #endif /* unlikely(expr) - hint that an expression is usually false */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) # define unlikely(expr) __builtin_expect(!!(expr), 0) #else # define unlikely(expr) (expr) #endif /* prefetchr(addr) - prefetch into L1 cache for read */ -#ifdef __GNUC__ +#undef prefetchr +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) # define prefetchr(addr) __builtin_prefetch((addr), 0) -#else +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0) +# elif defined(ARCH_ARM64) +# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchr(addr) __prefetch(addr) +# endif +#endif +#ifndef prefetchr # define prefetchr(addr) #endif /* prefetchw(addr) - prefetch into L1 cache for write */ -#ifdef __GNUC__ +#undef prefetchw +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) # define prefetchw(addr) __builtin_prefetch((addr), 1) -#else +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchw(addr) _m_prefetchw(addr) +# elif defined(ARCH_ARM64) +# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchw(addr) __prefetchw(addr) +# endif +#endif +#ifndef prefetchw # define prefetchw(addr) #endif @@ -191,13 +248,28 @@ typedef size_t machine_word_t; * the annotated type, must be aligned on n-byte boundaries. */ #undef _aligned_attribute -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_attribute(aligned) # define _aligned_attribute(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _aligned_attribute(n) __declspec(align(n)) #endif -/* Does the compiler support the 'target' function attribute? */ -#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ - (GCC_PREREQ(4, 4) || __has_attribute(target)) +/* + * _target_attribute(attrs) - override the compilation target for a function. + * + * This accepts one or more comma-separated suffixes to the -m prefix jointly + * forming the name of a machine-dependent option. On gcc-like compilers, this + * enables codegen for the given targets, including arbitrary compiler-generated + * code as well as the corresponding intrinsics. On other compilers this macro + * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. + */ +#if GCC_PREREQ(4, 4) || __has_attribute(target) +# define _target_attribute(attrs) __attribute__((target(attrs))) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 +#else +# define _target_attribute(attrs) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif /* ========================================================================== */ /* Miscellaneous macros */ @@ -299,8 +371,8 @@ static forceinline u64 bswap64(u64 v) * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed * efficiently on the target platform, otherwise 0. 
*/ -#if defined(__GNUC__) && \ - (defined(__x86_64__) || defined(__i386__) || \ +#if (defined(__GNUC__) || defined(__clang__)) && \ + (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ /* * For all compilation purposes, WebAssembly behaves like any other CPU @@ -520,7 +592,7 @@ put_unaligned_leword(machine_word_t v, u8 *p) static forceinline unsigned bsr32(u32 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_clz) return 31 - __builtin_clz(v); #elif defined(_MSC_VER) unsigned long i; @@ -539,7 +611,7 @@ bsr32(u32 v) static forceinline unsigned bsr64(u64 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) return 63 - __builtin_clzll(v); #elif defined(_MSC_VER) && defined(_WIN64) unsigned long i; @@ -574,7 +646,7 @@ bsrw(machine_word_t v) static forceinline unsigned bsf32(u32 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_ctz) return __builtin_ctz(v); #elif defined(_MSC_VER) unsigned long i; @@ -593,7 +665,7 @@ bsf32(u32 v) static forceinline unsigned bsf64(u64 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_ctzll) return __builtin_ctzll(v); #elif defined(_MSC_VER) && defined(_WIN64) unsigned long i; @@ -624,7 +696,7 @@ bsfw(machine_word_t v) * fallback implementation; use '#ifdef rbit32' to check if this is available. */ #undef rbit32 -#if defined(__GNUC__) && defined(__arm__) && \ +#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \ (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) static forceinline u32 rbit32(u32 v) @@ -633,7 +705,7 @@ rbit32(u32 v) return v; } #define rbit32 rbit32 -#elif defined(__GNUC__) && defined(__aarch64__) +#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64) static forceinline u32 rbit32(u32 v) { diff --git a/2.0/libdeflate/lib/adler32.c b/2.0/libdeflate/lib/adler32.c index a7f4a3391..3a526ac9d 100644 --- a/2.0/libdeflate/lib/adler32.c +++ b/2.0/libdeflate/lib/adler32.c @@ -26,7 +26,6 @@ */ #include "lib_common.h" -#include "libdeflate.h" /* The Adler-32 divisor, or "base", value */ #define DIVISOR 65521 @@ -91,9 +90,9 @@ adler32_generic(u32 adler, const u8 *p, size_t len) #undef DEFAULT_IMPL #undef arch_select_adler32_func typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); -#if defined(__arm__) || defined(__aarch64__) +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/adler32_impl.h" -#elif defined(__i386__) || defined(__x86_64__) +#elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/adler32_impl.h" #endif @@ -122,7 +121,7 @@ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) #define adler32_impl DEFAULT_IMPL #endif -LIBDEFLATEEXPORT u32 LIBDEFLATEAPI +LIBDEFLATEAPI u32 libdeflate_adler32(u32 adler, const void *buffer, size_t len) { if (buffer == NULL) /* Return initial value. 
*/ diff --git a/2.0/libdeflate/lib/arm/adler32_impl.h b/2.0/libdeflate/lib/arm/adler32_impl.h index e0589fe14..4083b2ef3 100644 --- a/2.0/libdeflate/lib/arm/adler32_impl.h +++ b/2.0/libdeflate/lib/arm/adler32_impl.h @@ -42,10 +42,10 @@ # if HAVE_NEON_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("fpu=neon"))) +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=neon") # else -# define ATTRIBUTES __attribute__((target("+simd"))) +# define ATTRIBUTES _target_attribute("+simd") # endif # endif # include @@ -53,29 +53,35 @@ static forceinline ATTRIBUTES void adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, u32 *s1, u32 *s2) { - const uint16x8_t mults_a = { 64, 63, 62, 61, 60, 59, 58, 57, }; - const uint16x8_t mults_b = { 56, 55, 54, 53, 52, 51, 50, 49, }; - const uint16x8_t mults_c = { 48, 47, 46, 45, 44, 43, 42, 41, }; - const uint16x8_t mults_d = { 40, 39, 38, 37, 36, 35, 34, 33, }; - const uint16x8_t mults_e = { 32, 31, 30, 29, 28, 27, 26, 25, }; - const uint16x8_t mults_f = { 24, 23, 22, 21, 20, 19, 18, 17, }; - const uint16x8_t mults_g = { 16, 15, 14, 13, 12, 11, 10, 9, }; - const uint16x8_t mults_h = { 8, 7, 6, 5, 4, 3, 2, 1, }; + static const u16 _aligned_attribute(16) mults[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const uint16x8_t mults_a = vld1q_u16(&mults[0]); + const uint16x8_t mults_b = vld1q_u16(&mults[8]); + const uint16x8_t mults_c = vld1q_u16(&mults[16]); + const uint16x8_t mults_d = vld1q_u16(&mults[24]); + const uint16x8_t mults_e = vld1q_u16(&mults[32]); + const uint16x8_t mults_f = vld1q_u16(&mults[40]); + const uint16x8_t mults_g = vld1q_u16(&mults[48]); + const uint16x8_t mults_h = vld1q_u16(&mults[56]); - uint32x4_t v_s1 = { 0, 0, 0, 0 }; - uint32x4_t v_s2 = { 0, 0, 0, 0 }; + uint32x4_t v_s1 = vdupq_n_u32(0); + uint32x4_t v_s2 = vdupq_n_u32(0); /* * v_byte_sums_* contain the sum of the bytes at index i across all * 64-byte segments, for each index 0..63. */ - uint16x8_t v_byte_sums_a = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_b = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_c = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_d = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_e = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_f = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_g = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_h = { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint16x8_t v_byte_sums_a = vdupq_n_u16(0); + uint16x8_t v_byte_sums_b = vdupq_n_u16(0); + uint16x8_t v_byte_sums_c = vdupq_n_u16(0); + uint16x8_t v_byte_sums_d = vdupq_n_u16(0); + uint16x8_t v_byte_sums_e = vdupq_n_u16(0); + uint16x8_t v_byte_sums_f = vdupq_n_u16(0); + uint16x8_t v_byte_sums_g = vdupq_n_u16(0); + uint16x8_t v_byte_sums_h = vdupq_n_u16(0); do { /* Load the next 64 bytes. */ @@ -89,7 +95,7 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, * Accumulate the previous s1 counters into the s2 counters. * The needed multiplication by 64 is delayed to later. */ - v_s2 += v_s1; + v_s2 = vaddq_u32(v_s2, v_s1); /* * Add the 64 bytes to their corresponding v_byte_sums counters, @@ -113,7 +119,7 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, } while (p != end); /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... 
+ 1*bytesum63) */ -#ifdef __arm__ +#ifdef ARCH_ARM32 # define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) #else # define umlal2 vmlal_high_u16 @@ -138,8 +144,15 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, #undef umlal2 /* Horizontal sum to finish up */ - *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3]; - *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3]; +#ifdef ARCH_ARM32 + *s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + + vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); + *s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + + vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); +#else + *s1 += vaddvq_u32(v_s1); + *s2 += vaddvq_u32(v_s2); +#endif } # include "../adler32_vec_template.h" #endif /* Regular NEON implementation */ @@ -156,16 +169,16 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, # define ATTRIBUTES # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("dotprod"))) +# define ATTRIBUTES _target_attribute("dotprod") /* * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the * default target is armv8.3-a or later in which case it must be omitted. * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. */ # elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES __attribute__((target("+dotprod"))) +# define ATTRIBUTES _target_attribute("+dotprod") # else -# define ATTRIBUTES __attribute__((target("arch=armv8.2-a+dotprod"))) +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") # endif # endif # include @@ -173,35 +186,32 @@ static forceinline ATTRIBUTES void adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, u32 *s1, u32 *s2) { - const uint8x16_t mults_a = { + static const u8 _aligned_attribute(16) mults[64] = { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - }; - const uint8x16_t mults_b = { 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - }; - const uint8x16_t mults_c = { 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - }; - const uint8x16_t mults_d = { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, }; - const uint8x16_t ones = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, - }; - uint32x4_t v_s1_a = { 0, 0, 0, 0 }; - uint32x4_t v_s1_b = { 0, 0, 0, 0 }; - uint32x4_t v_s1_c = { 0, 0, 0, 0 }; - uint32x4_t v_s1_d = { 0, 0, 0, 0 }; - uint32x4_t v_s2_a = { 0, 0, 0, 0 }; - uint32x4_t v_s2_b = { 0, 0, 0, 0 }; - uint32x4_t v_s2_c = { 0, 0, 0, 0 }; - uint32x4_t v_s2_d = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_a = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_b = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_c = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_d = { 0, 0, 0, 0 }; + const uint8x16_t mults_a = vld1q_u8(&mults[0]); + const uint8x16_t mults_b = vld1q_u8(&mults[16]); + const uint8x16_t mults_c = vld1q_u8(&mults[32]); + const uint8x16_t mults_d = vld1q_u8(&mults[48]); + const uint8x16_t ones = vdupq_n_u8(1); + uint32x4_t v_s1_a = vdupq_n_u32(0); + uint32x4_t v_s1_b = vdupq_n_u32(0); + uint32x4_t v_s1_c = vdupq_n_u32(0); + uint32x4_t v_s1_d = vdupq_n_u32(0); + uint32x4_t v_s2_a = vdupq_n_u32(0); + uint32x4_t v_s2_b = vdupq_n_u32(0); + uint32x4_t v_s2_c = vdupq_n_u32(0); + uint32x4_t v_s2_d = vdupq_n_u32(0); + uint32x4_t v_s1_sums_a = vdupq_n_u32(0); + uint32x4_t v_s1_sums_b = vdupq_n_u32(0); + uint32x4_t v_s1_sums_c = vdupq_n_u32(0); + uint32x4_t v_s1_sums_d = vdupq_n_u32(0); uint32x4_t v_s1; uint32x4_t v_s2; + uint32x4_t v_s1_sums; do { uint8x16_t bytes_a = *p++; @@ 
-209,29 +219,31 @@ adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, uint8x16_t bytes_c = *p++; uint8x16_t bytes_d = *p++; - v_s1_sums_a += v_s1_a; + v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones); v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a); - v_s1_sums_b += v_s1_b; + v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones); v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b); - v_s1_sums_c += v_s1_c; + v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones); v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c); - v_s1_sums_d += v_s1_d; + v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones); v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d); } while (p != end); - v_s1 = v_s1_a + v_s1_b + v_s1_c + v_s1_d; - v_s2 = v_s2_a + v_s2_b + v_s2_c + v_s2_d + - vqshlq_n_u32(v_s1_sums_a + v_s1_sums_b + - v_s1_sums_c + v_s1_sums_d, 6); - *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3]; - *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3]; + v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), vaddq_u32(v_s1_c, v_s1_d)); + v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), vaddq_u32(v_s2_c, v_s2_d)); + v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, v_s1_sums_b), + vaddq_u32(v_s1_sums_c, v_s1_sums_d)); + v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); + + *s1 += vaddvq_u32(v_s1); + *s2 += vaddvq_u32(v_s2); } # include "../adler32_vec_template.h" #endif /* NEON+dotprod implementation */ diff --git a/2.0/libdeflate/lib/arm/arm_cpu_features.c b/2.0/libdeflate/lib/arm/arm_cpu_features.c index 98f881dec..72ab03da3 100644 --- a/2.0/libdeflate/lib/arm/arm_cpu_features.c +++ b/2.0/libdeflate/lib/arm/arm_cpu_features.c @@ -31,8 +31,9 @@ */ #ifdef __APPLE__ -#undef _ANSI_SOURCE -#define _DARWIN_C_SOURCE /* for sysctlbyname() */ +# undef _ANSI_SOURCE +# undef _DARWIN_C_SOURCE +# define _DARWIN_C_SOURCE /* for sysctlbyname() */ #endif #include "../cpu_features_common.h" /* must be included first */ @@ -79,7 +80,7 @@ static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) goto out; } filled += ret; - } while (filled < (long)(2 * sizeof(long))); + } while (filled < 2 * sizeof(long)); i = 0; do { @@ -92,7 +93,7 @@ static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) *hwcap2 = value; i += 2; filled -= 2 * sizeof(long); - } while (filled >= (long)(2 * sizeof(long))); + } while (filled >= 2 * sizeof(long)); memmove(auxbuf, &auxbuf[i], filled); } @@ -108,7 +109,7 @@ static u32 query_arm_cpu_features(void) scan_auxv(&hwcap, &hwcap2); -#ifdef __arm__ +#ifdef ARCH_ARM32 STATIC_ASSERT(sizeof(long) == 4); if (hwcap & (1 << 12)) /* HWCAP_NEON */ features |= ARM_CPU_FEATURE_NEON; @@ -167,6 +168,23 @@ static u32 query_arm_cpu_features(void) } return features; } +#elif defined(_WIN32) + +#include + +static u32 query_arm_cpu_features(void) +{ + u32 features = ARM_CPU_FEATURE_NEON; + + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_PMULL; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_CRC32; + + /* FIXME: detect SHA3 and DOTPROD support too. 
*/ + + return features; +} #else #error "unhandled case" #endif diff --git a/2.0/libdeflate/lib/arm/cpu_features.h b/2.0/libdeflate/lib/arm/cpu_features.h index c6dfe86d3..c55f007cf 100644 --- a/2.0/libdeflate/lib/arm/cpu_features.h +++ b/2.0/libdeflate/lib/arm/cpu_features.h @@ -32,12 +32,13 @@ #define HAVE_DYNAMIC_ARM_CPU_FEATURES 0 -#if defined(__arm__) || defined(__aarch64__) +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ - !defined(FREESTANDING) && \ - (defined(__linux__) || \ - (defined(__aarch64__) && defined(__APPLE__))) +#if !defined(FREESTANDING) && \ + (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \ + (defined(__linux__) || \ + (defined(__APPLE__) && defined(ARCH_ARM64)) || \ + (defined(_WIN32) && defined(ARCH_ARM64))) # undef HAVE_DYNAMIC_ARM_CPU_FEATURES # define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 #endif @@ -71,19 +72,18 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ /* NEON */ -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || defined(ARCH_ARM64) # define HAVE_NEON_NATIVE 1 #else # define HAVE_NEON_NATIVE 0 #endif -#define HAVE_NEON_TARGET HAVE_DYNAMIC_ARM_CPU_FEATURES /* * With both gcc and clang, NEON intrinsics require that the main target has * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, * r226563 for arm64), hardware floating point support is sufficient. */ #if HAVE_NEON_NATIVE || \ - (HAVE_NEON_TARGET && GCC_PREREQ(6, 1) && defined(__ARM_FP)) + (HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) # define HAVE_NEON_INTRIN 1 #else # define HAVE_NEON_INTRIN 0 @@ -95,20 +95,50 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_PMULL_NATIVE 0 #endif -#define HAVE_PMULL_TARGET \ +#if HAVE_PMULL_NATIVE || \ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(6, 1) || __has_builtin(__builtin_neon_vmull_p64))) + HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ + (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \ + defined(_MSC_VER)) && \ + /* + * On arm32 with clang, the crypto intrinsics (which include pmull) + * are not defined, even when using -mfpu=crypto-neon-fp-armv8, + * because clang's puts their definitions behind + * __aarch64__. + */ \ + !(defined(ARCH_ARM32) && defined(__clang__))) +# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ + /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ +# ifdef _MSC_VER +# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) +# else +# define compat_vmull_p64(a, b) vmull_p64((a), (b)) +# endif +#else +# define HAVE_PMULL_INTRIN 0 +#endif /* - * On arm32 with clang, the crypto intrinsics (which include pmull) are not - * defined, even when using -mfpu=crypto-neon-fp-armv8, because clang's - * puts their definitions behind __aarch64__. + * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was + * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc + * 13 is needed. 
A minimal program that fails to build due to this bug when + * compiled with -mcpu=emag, at least with gcc 10 through 12, is: + * + * static inline __attribute__((always_inline,target("+crypto"))) void f() {} + * void g() { f(); } + * + * The error is: + * + * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch + * + * The workaround is to explicitly add the crypto target to the non-inline + * function g(), even though this should not be required due to -mcpu=emag + * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. */ -#if HAVE_NEON_INTRIN && (HAVE_PMULL_NATIVE || HAVE_PMULL_TARGET) && \ - !(defined(__arm__) && defined(__clang__)) && \ - CPU_IS_LITTLE_ENDIAN() /* pmull code on big endian is untested */ -# define HAVE_PMULL_INTRIN 1 +#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ + GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 #else -# define HAVE_PMULL_INTRIN 0 +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 #endif /* CRC32 */ @@ -117,30 +147,50 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_CRC32_NATIVE 0 #endif -#define HAVE_CRC32_TARGET \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(4, 9) || __has_builtin(__builtin_arm_crc32b))) -/* - * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled in - * the main target has been affected by two gcc bugs, which we must avoid by - * only allowing gcc versions that have the corresponding fixes. First, gcc - * commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a and - * hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, gcc - * commit c1cdabe3aab8 ("arm: reorder assembler architecture directives - * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when binutils is - * 2.34 or later, due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. - * We use the second set of prerequisites, as they are stricter and we have no - * way to detect the binutils version directly from a C source file. - */ -#define HAVE_CRC32_INTRIN \ - (HAVE_CRC32_NATIVE || (HAVE_CRC32_TARGET && \ - (!GCC_PREREQ(1, 0) || \ - GCC_PREREQ(11, 3) || \ - (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ - (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))))) +#undef HAVE_CRC32_INTRIN +#if HAVE_CRC32_NATIVE +# define HAVE_CRC32_INTRIN 1 +#elif HAVE_DYNAMIC_ARM_CPU_FEATURES +# if GCC_PREREQ(1, 0) + /* + * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled + * in the main target has been affected by two gcc bugs, which we must avoid + * by only allowing gcc versions that have the corresponding fixes. First, + * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a + * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, + * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives + * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when + * binutils is 2.34 or later, due to + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second + * set of prerequisites, as they are stricter and we have no way to detect + * the binutils version directly from a C source file. + * + * Also exclude the cases where the main target arch is armv6kz or armv7e-m. + * In those cases, gcc doesn't let functions that use the main arch be + * inlined into functions that are targeted to armv8-a+crc. (armv8-a is + * necessary for crc to be accepted at all.) That causes build errors. 
+ * This issue happens for these specific sub-archs because they are not a + * subset of armv8-a. Note: clang does not have this limitation. + */ +# if (GCC_PREREQ(11, 3) || \ + (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ + (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ + !defined(__ARM_ARCH_6KZ__) && \ + !defined(__ARM_ARCH_7EM__) +# define HAVE_CRC32_INTRIN 1 +# endif +# elif CLANG_PREREQ(3, 4, 6000000) +# define HAVE_CRC32_INTRIN 1 +# elif defined(_MSC_VER) +# define HAVE_CRC32_INTRIN 1 +# endif +#endif +#ifndef HAVE_CRC32_INTRIN +# define HAVE_CRC32_INTRIN 0 +#endif /* SHA3 (needed for the eor3 instruction) */ -#ifdef __aarch64__ +#if defined(ARCH_ARM64) && !defined(_MSC_VER) # ifdef __ARM_FEATURE_SHA3 # define HAVE_SHA3_NATIVE 1 # else @@ -149,9 +199,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; } # define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ (GCC_PREREQ(8, 1) /* r256478 */ || \ CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) -# define HAVE_SHA3_INTRIN ((HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ +# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ + (HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ (GCC_PREREQ(9, 1) /* r268049 */ || \ - __has_builtin(__builtin_neon_veor3q_v))) + CLANG_PREREQ(13, 0, 13160000))) #else # define HAVE_SHA3_NATIVE 0 # define HAVE_SHA3_TARGET 0 @@ -159,20 +210,22 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #endif /* dotprod */ -#ifdef __aarch64__ +#ifdef ARCH_ARM64 # ifdef __ARM_FEATURE_DOTPROD # define HAVE_DOTPROD_NATIVE 1 # else # define HAVE_DOTPROD_NATIVE 0 # endif -# define HAVE_DOTPROD_TARGET \ +# if HAVE_DOTPROD_NATIVE || \ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) || __has_builtin(__builtin_neon_vdotq_v))) -# define HAVE_DOTPROD_INTRIN \ - (HAVE_NEON_INTRIN && (HAVE_DOTPROD_NATIVE || HAVE_DOTPROD_TARGET)) + (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ + defined(_MSC_VER))) +# define HAVE_DOTPROD_INTRIN 1 +# else +# define HAVE_DOTPROD_INTRIN 0 +# endif #else # define HAVE_DOTPROD_NATIVE 0 -# define HAVE_DOTPROD_TARGET 0 # define HAVE_DOTPROD_INTRIN 0 #endif @@ -184,7 +237,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } * corresponding __ARM_FEATURE_* macros while including the headers. 
*/ #if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(__arm__)) + (defined(__clang__) || defined(ARCH_ARM32)) # define __ARM_FEATURE_CRC32 1 #endif #if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) @@ -194,7 +247,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } # define __ARM_FEATURE_DOTPROD 1 #endif #if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(__arm__)) + (defined(__clang__) || defined(ARCH_ARM32)) # include # undef __ARM_FEATURE_CRC32 #endif @@ -207,6 +260,6 @@ static inline u32 get_arm_cpu_features(void) { return 0; } # undef __ARM_FEATURE_DOTPROD #endif -#endif /* __arm__ || __aarch64__ */ +#endif /* ARCH_ARM32 || ARCH_ARM64 */ #endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/2.0/libdeflate/lib/arm/crc32_impl.h b/2.0/libdeflate/lib/arm/crc32_impl.h index 0db6354d9..c802cdf03 100644 --- a/2.0/libdeflate/lib/arm/crc32_impl.h +++ b/2.0/libdeflate/lib/arm/crc32_impl.h @@ -48,22 +48,31 @@ # if HAVE_CRC32_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ +# ifdef ARCH_ARM32 # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("armv8-a,crc"))) +# define ATTRIBUTES _target_attribute("armv8-a,crc") +# elif defined(__ARM_PCS_VFP) + /* + * +simd is needed to avoid a "selected architecture lacks an FPU" + * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not + * explicitly specified on the command line. + */ +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") # else -# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc"))) +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") # endif # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crc"))) +# define ATTRIBUTES _target_attribute("crc") # else -# define ATTRIBUTES __attribute__((target("+crc"))) +# define ATTRIBUTES _target_attribute("+crc") # endif # endif # endif -#include +#ifndef _MSC_VER +# include +#endif /* * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN @@ -233,26 +242,36 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*() * for implementations that use pmull for folding the data itself. */ -#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN && \ - ((HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE) || \ - (HAVE_CRC32_TARGET && HAVE_PMULL_TARGET)) -# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE +#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN +# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8"))) +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crc,crypto"))) +# define ATTRIBUTES _target_attribute("crc,aes") # else -# define ATTRIBUTES __attribute__((target("+crc,+crypto"))) +# define ATTRIBUTES _target_attribute("+crc,+crypto") # endif # endif # endif -#include +#ifndef _MSC_VER +# include +#endif #include +/* Do carryless multiplication of two 32-bit values. */ +static forceinline ATTRIBUTES u64 +clmul_u32(u32 a, u32 b) +{ + uint64x2_t res = vreinterpretq_u64_p128( + compat_vmull_p64((poly64_t)a, (poly64_t)b)); + + return vgetq_lane_u64(res, 0); +} + /* * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more * quickly, and supports a variable chunk length. 
The chunk length is @@ -262,9 +281,9 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) static forceinline ATTRIBUTES u32 combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) { - u64 res0 = vmull_p64(crc0, crc32_mults_for_chunklen[i][0]); - u64 res1 = vmull_p64(crc1, crc32_mults_for_chunklen[i][1]); - u64 res2 = vmull_p64(crc2, crc32_mults_for_chunklen[i][2]); + u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]); + u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]); + u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]); return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; } @@ -426,16 +445,25 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) #if HAVE_PMULL_INTRIN # define crc32_arm_pmullx4 crc32_arm_pmullx4 # define SUFFIX _pmullx4 -# if HAVE_PMULL_NATIVE +# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("fpu=crypto-neon-fp-armv8"))) +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto"))) + /* + * This used to use "crypto", but that stopped working with clang 16. + * Now only "aes" works. "aes" works with older versions too, so use + * that. No "+" prefix; clang 15 and earlier doesn't accept that. + */ +# define ATTRIBUTES _target_attribute("aes") # else -# define ATTRIBUTES __attribute__((target("+crypto"))) + /* + * With gcc, only "+crypto" works. Both the "+" prefix and the + * "crypto" (not "aes") are essential... + */ +# define ATTRIBUTES _target_attribute("+crypto") # endif # endif # endif @@ -445,17 +473,25 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) static u32 ATTRIBUTES MAYBE_UNUSED crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) { - const poly64x2_t multipliers_4 = (poly64x2_t)CRC32_4VECS_MULTS; - const poly64x2_t multipliers_2 = (poly64x2_t)CRC32_2VECS_MULTS; - const poly64x2_t multipliers_1 = (poly64x2_t)CRC32_1VECS_MULTS; - const uint8x16_t zeroes = (uint8x16_t){ 0 }; - const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF }; + static const u64 _aligned_attribute(16) mults[3][2] = { + CRC32_1VECS_MULTS, + CRC32_4VECS_MULTS, + CRC32_2VECS_MULTS, + }; + static const u64 _aligned_attribute(16) final_mults[3][2] = { + { CRC32_FINAL_MULT, 0 }, + { CRC32_BARRETT_CONSTANT_1, 0 }, + { CRC32_BARRETT_CONSTANT_2, 0 }, + }; + const uint8x16_t zeroes = vdupq_n_u8(0); + const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF)); + const poly64x2_t multipliers_1 = load_multipliers(mults[0]); uint8x16_t v0, v1, v2, v3; if (len < 64 + 15) { if (len < 16) return crc32_slice1(crc, p, len); - v0 = vld1q_u8(p) ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); p += 16; len -= 16; while (len >= 16) { @@ -464,10 +500,12 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) len -= 16; } } else { + const poly64x2_t multipliers_4 = load_multipliers(mults[1]); + const poly64x2_t multipliers_2 = load_multipliers(mults[2]); const size_t align = -(uintptr_t)p & 15; const uint8x16_t *vp; - v0 = vld1q_u8(p) ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); p += 16; /* Align p to the next 16-byte boundary. */ if (align) { @@ -508,21 +546,19 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * which is equivalent to multiplying by x^32. This is needed because * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). 
*/ - v0 = vextq_u8(v0, zeroes, 8) ^ - (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v0), - CRC32_1VECS_MULT_2); + + v0 = veorq_u8(vextq_u8(v0, zeroes, 8), + clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1)); /* Fold 96 => 64 bits. */ - v0 = vextq_u8(v0, zeroes, 4) ^ - (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v0 & mask32), - CRC32_FINAL_MULT); + v0 = veorq_u8(vextq_u8(v0, zeroes, 4), + clmul_low(vandq_u8(v0, mask32), + load_multipliers(final_mults[0]))); /* Reduce 64 => 32 bits using Barrett reduction. */ - v1 = (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v0 & mask32), - CRC32_BARRETT_CONSTANT_1); - v1 = (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v1 & mask32), - CRC32_BARRETT_CONSTANT_2); - return ((uint32x4_t)(v0 ^ v1))[1]; + v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1])); + v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2])); + return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1); } #undef SUFFIX #undef ATTRIBUTES @@ -535,18 +571,16 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * * See crc32_pmull_wide.h for explanation. */ -#if defined(__aarch64__) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - ((HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE) || \ - (HAVE_PMULL_TARGET && HAVE_CRC32_TARGET)) +#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN # define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc # define SUFFIX _pmullx12_crc -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE +# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto,crc"))) +# define ATTRIBUTES _target_attribute("aes,crc") # else -# define ATTRIBUTES __attribute__((target("+crypto,+crc"))) +# define ATTRIBUTES _target_attribute("+crypto,+crc") # endif # endif # define ENABLE_EOR3 0 @@ -562,25 +596,25 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3. */ -#if defined(__aarch64__) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - ((HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE) || \ - (HAVE_PMULL_TARGET && HAVE_CRC32_TARGET && HAVE_SHA3_TARGET)) +#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ + (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE) # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 # define SUFFIX _pmullx12_crc_eor3 -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE +# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \ + !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto,crc,sha3"))) +# define ATTRIBUTES _target_attribute("aes,crc,sha3") /* * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the * default target is armv8.3-a or later in which case it must be omitted. * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. 
*/ # elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES __attribute__((target("+crypto,+crc,+sha3"))) +# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3") # else -# define ATTRIBUTES __attribute__((target("arch=armv8.2-a+crypto+crc+sha3"))) +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3") # endif # endif # define ENABLE_EOR3 1 diff --git a/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h b/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h index 02dfc3ae0..1cd1cc188 100644 --- a/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h +++ b/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h @@ -39,21 +39,51 @@ #include +/* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */ +#undef u32_to_bytevec +static forceinline ATTRIBUTES uint8x16_t +ADD_SUFFIX(u32_to_bytevec)(u32 a) +{ + return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); +} +#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) + +/* Load two 64-bit values into a vector. */ +#undef load_multipliers +static forceinline ATTRIBUTES poly64x2_t +ADD_SUFFIX(load_multipliers)(const u64 p[2]) +{ + return vreinterpretq_p64_u64(vld1q_u64(p)); +} +#define load_multipliers ADD_SUFFIX(load_multipliers) + +/* Do carryless multiplication of the low halves of two vectors. */ +#undef clmul_low +static forceinline ATTRIBUTES uint8x16_t +ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b) +{ + return vreinterpretq_u8_p128( + compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), + vgetq_lane_p64(b, 0))); +} +#define clmul_low ADD_SUFFIX(clmul_low) + +/* Do carryless multiplication of the high halves of two vectors. */ #undef clmul_high -static forceinline ATTRIBUTES poly128_t -ADD_SUFFIX(clmul_high)(poly64x2_t a, poly64x2_t b) +static forceinline ATTRIBUTES uint8x16_t +ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b) { -#if defined(__clang__) && defined(__aarch64__) +#if defined(__clang__) && defined(ARCH_ARM64) /* * Use inline asm to ensure that pmull2 is really used. This works * around clang bug https://github.com/llvm/llvm-project/issues/52868. */ - poly128_t res; + uint8x16_t res; __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); return res; #else - return vmull_high_p64(a, b); + return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); #endif } #define clmul_high ADD_SUFFIX(clmul_high) @@ -73,7 +103,7 @@ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) return res; #endif #else /* ENABLE_EOR3 */ - return a ^ b ^ c; + return veorq_u8(veorq_u8(a, b), c); #endif /* !ENABLE_EOR3 */ } #define eor3 ADD_SUFFIX(eor3) @@ -82,15 +112,10 @@ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) { - /* - * Using vget_low_* instead of vector indexing is necessary to avoid - * poor code generation with gcc on arm32. 
- */ - poly128_t a = vmull_p64((poly64_t)vget_low_u8(src), - (poly64_t)vget_low_p64(multipliers)); - poly128_t b = clmul_high((poly64x2_t)src, multipliers); + uint8x16_t a = clmul_low(src, multipliers); + uint8x16_t b = clmul_high(src, multipliers); - return eor3((uint8x16_t)a, (uint8x16_t)b, dst); + return eor3(a, b, dst); } #define fold_vec ADD_SUFFIX(fold_vec) @@ -98,7 +123,7 @@ ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) { -#ifdef __aarch64__ +#ifdef ARCH_ARM64 return vqtbl1q_u8(table, indices); #else uint8x8x2_t tab2; @@ -144,7 +169,8 @@ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, x0 = vtbl(v, lshift); /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ - bsl_mask = (uint8x16_t)vshrq_n_s8((int8x16_t)rshift, 7); + bsl_mask = vreinterpretq_u8_s8( + vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); /* * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' diff --git a/2.0/libdeflate/lib/arm/crc32_pmull_wide.h b/2.0/libdeflate/lib/arm/crc32_pmull_wide.h index bf0d722a0..a72e1d876 100644 --- a/2.0/libdeflate/lib/arm/crc32_pmull_wide.h +++ b/2.0/libdeflate/lib/arm/crc32_pmull_wide.h @@ -45,7 +45,9 @@ * Apple M1 processor is an example of such a CPU. */ -#include +#ifndef _MSC_VER +# include +#endif #include #include "crc32_pmull_helpers.h" @@ -53,22 +55,24 @@ static u32 ATTRIBUTES MAYBE_UNUSED ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) { - const poly64x2_t multipliers_12 = (poly64x2_t)CRC32_12VECS_MULTS; - const poly64x2_t multipliers_6 = (poly64x2_t)CRC32_6VECS_MULTS; - const poly64x2_t multipliers_4 = (poly64x2_t)CRC32_4VECS_MULTS; - const poly64x2_t multipliers_3 = (poly64x2_t)CRC32_3VECS_MULTS; - const poly64x2_t multipliers_2 = (poly64x2_t)CRC32_2VECS_MULTS; - const poly64x2_t multipliers_1 = (poly64x2_t)CRC32_1VECS_MULTS; uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; if (len < 3 * 192) { + static const u64 _aligned_attribute(16) mults[3][2] = { + CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS, + }; + poly64x2_t multipliers_4, multipliers_2, multipliers_1; + if (len < 64) goto tail; + multipliers_4 = load_multipliers(mults[0]); + multipliers_2 = load_multipliers(mults[1]); + multipliers_1 = load_multipliers(mults[2]); /* * Short length; don't bother aligning the pointer, and fold * 64 bytes (4 vectors) at a time, at most. 
*/ - v0 = vld1q_u8(p + 0) ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); v1 = vld1q_u8(p + 16); v2 = vld1q_u8(p + 32); v3 = vld1q_u8(p + 48); @@ -92,6 +96,14 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) } v0 = fold_vec(v0, v1, multipliers_1); } else { + static const u64 _aligned_attribute(16) mults[4][2] = { + CRC32_12VECS_MULTS, CRC32_6VECS_MULTS, + CRC32_3VECS_MULTS, CRC32_1VECS_MULTS, + }; + const poly64x2_t multipliers_12 = load_multipliers(mults[0]); + const poly64x2_t multipliers_6 = load_multipliers(mults[1]); + const poly64x2_t multipliers_3 = load_multipliers(mults[2]); + const poly64x2_t multipliers_1 = load_multipliers(mults[3]); const size_t align = -(uintptr_t)p & 15; const uint8x16_t *vp; @@ -114,7 +126,7 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) len -= align; } vp = (const uint8x16_t *)p; - v0 = *vp++ ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); v1 = *vp++; v2 = *vp++; v3 = *vp++; @@ -177,8 +189,8 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) p = (const u8 *)vp; } /* Reduce 128 to 32 bits using crc32 instructions. */ - crc = __crc32d(0, (u64)vget_low_u8(v0)); - crc = __crc32d(crc, (u64)vget_high_u8(v0)); + crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); + crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); tail: /* Finish up the remainder using crc32 instructions. */ if (len & 32) { diff --git a/2.0/libdeflate/lib/arm/matchfinder_impl.h b/2.0/libdeflate/lib/arm/matchfinder_impl.h index 4b10ba2f8..b20f56a3b 100644 --- a/2.0/libdeflate/lib/arm/matchfinder_impl.h +++ b/2.0/libdeflate/lib/arm/matchfinder_impl.h @@ -36,11 +36,7 @@ static forceinline void matchfinder_init_neon(mf_pos_t *data, size_t size) { int16x8_t *p = (int16x8_t *)data; - int16x8_t v = (int16x8_t) { - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - }; + int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); @@ -61,12 +57,7 @@ static forceinline void matchfinder_rebase_neon(mf_pos_t *data, size_t size) { int16x8_t *p = (int16x8_t *)data; - int16x8_t v = (int16x8_t) { - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - }; + int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); diff --git a/2.0/libdeflate/lib/bt_matchfinder.h b/2.0/libdeflate/lib/bt_matchfinder.h index d5b2dd561..b247d4bcc 100644 --- a/2.0/libdeflate/lib/bt_matchfinder.h +++ b/2.0/libdeflate/lib/bt_matchfinder.h @@ -85,7 +85,7 @@ struct lz_match { u16 offset; }; -struct bt_matchfinder { +struct MATCHFINDER_ALIGNED bt_matchfinder { /* The hash table for finding length 3 matches */ mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; @@ -98,8 +98,7 @@ struct bt_matchfinder { * children of the node for the sequence with position 'pos' are * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. 
*/ mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; - -} MATCHFINDER_ALIGNED; +}; /* Prepare the matchfinder for a new input buffer. */ static forceinline void diff --git a/2.0/libdeflate/lib/cpu_features_common.h b/2.0/libdeflate/lib/cpu_features_common.h index f23493816..3019ba28d 100644 --- a/2.0/libdeflate/lib/cpu_features_common.h +++ b/2.0/libdeflate/lib/cpu_features_common.h @@ -29,9 +29,11 @@ #define LIB_CPU_FEATURES_COMMON_H #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) -# undef _ANSI_SOURCE /* for strdup() and strtok_r() */ + /* for strdup() and strtok_r() */ +# undef _ANSI_SOURCE # ifndef __APPLE__ -# define _GNU_SOURCE 1 +# undef _GNU_SOURCE +# define _GNU_SOURCE # endif # include # include diff --git a/2.0/libdeflate/lib/crc32.c b/2.0/libdeflate/lib/crc32.c index 5b55c3dba..c3a4da48b 100644 --- a/2.0/libdeflate/lib/crc32.c +++ b/2.0/libdeflate/lib/crc32.c @@ -169,7 +169,6 @@ */ #include "lib_common.h" -#include "libdeflate.h" #include "crc32_multipliers.h" #include "crc32_tables.h" @@ -223,9 +222,9 @@ crc32_slice1(u32 crc, const u8 *p, size_t len) #undef DEFAULT_IMPL #undef arch_select_crc32_func typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len); -#if defined(__arm__) || defined(__aarch64__) +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/crc32_impl.h" -#elif defined(__i386__) || defined(__x86_64__) +#elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/crc32_impl.h" #endif @@ -254,7 +253,7 @@ static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len) #define crc32_impl DEFAULT_IMPL #endif -LIBDEFLATEEXPORT u32 LIBDEFLATEAPI +LIBDEFLATEAPI u32 libdeflate_crc32(u32 crc, const void *p, size_t len) { if (p == NULL) /* Return initial value. */ diff --git a/2.0/libdeflate/lib/crc32_table.h b/2.0/libdeflate/lib/crc32_table.h deleted file mode 100644 index 05421b982..000000000 --- a/2.0/libdeflate/lib/crc32_table.h +++ /dev/null @@ -1,526 +0,0 @@ -/* - * crc32_table.h - data table to accelerate CRC-32 computation - * - * THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c. DO NOT EDIT. 
- */ - -#include - -static const uint32_t crc32_table[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, -#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8) - 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, - 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, - 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, - 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, - 0x4ac21251, 0x53d92310, 0x78f470d3, 
0x61ef4192, - 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, - 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, - 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, - 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, - 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, - 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, - 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, - 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, - 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, - 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, - 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, - 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, - 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, - 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, - 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, - 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, - 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, - 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, - 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, - 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, - 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, - 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, - 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, - 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, - 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, - 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, - 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, - 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88, - 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, - 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, - 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, - 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, - 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, - 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, - 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, - 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, - 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, - 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, - 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, - 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, - 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, - 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, - 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277, - 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, - 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, - 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, - 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, - 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, - 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, - 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, - 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, - 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, - 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, - 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, - 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, - 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, - 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, - 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, - 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, - 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, - 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, - 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, - 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, - 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, - 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, - 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, - 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, - 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, - 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, - 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, - 0x3157bf84, 0x3095d5b3, 0x32d36bea, 
0x331101dd, - 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, - 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, - 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, - 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, - 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, - 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, - 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, - 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, - 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, - 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, - 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, - 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, - 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, - 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, - 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, - 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, - 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, - 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, - 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, - 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, - 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, - 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, - 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, - 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, - 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, - 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, - 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, - 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, - 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, - 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, - 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, - 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, - 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, - 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, - 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, - 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, - 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, - 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, - 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, - 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, - 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, - 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, - 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, - 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, - 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, - 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, - 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, - 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, - 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, - 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, - 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, - 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, - 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, - 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, - 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, - 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, - 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, - 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, - 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, - 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, - 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, - 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, - 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, - 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, - 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, - 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, - 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, - 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, - 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, - 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, - 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 
0x4c15df3c, - 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, - 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, - 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, - 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, - 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, - 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, - 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, - 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, - 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, - 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, - 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, - 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, - 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, - 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, - 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, - 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, - 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, - 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, - 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, - 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, - 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, - 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, - 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, - 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, - 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, - 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, - 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, - 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b, - 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, - 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, - 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, - 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, - 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, - 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, - 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, - 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, - 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, - 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, - 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, - 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, - 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, - 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, - 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, - 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, - 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, -#endif /* CRC32_SLICE4 || CRC32_SLICE8 */ -#if defined(CRC32_SLICE8) - 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, - 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, - 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, - 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, - 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, - 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, - 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, - 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, - 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, - 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, - 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, - 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, - 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, - 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, - 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, - 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, - 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, - 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, - 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, - 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, - 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, - 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, - 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, - 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, - 0x9932774d, 
0xa4525efd, 0xe3f2242d, 0xde920d9d, - 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, - 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, - 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, - 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, - 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, - 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, - 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, - 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, - 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, - 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, - 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, - 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, - 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, - 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, - 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, - 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, - 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, - 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, - 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, - 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, - 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, - 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, - 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, - 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, - 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, - 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, - 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, - 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, - 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349, - 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, - 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, - 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, - 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, - 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, - 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, - 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, - 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, - 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, - 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, - 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, - 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, - 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, - 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, - 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, - 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, - 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, - 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, - 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, - 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, - 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, - 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, - 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, - 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, - 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, - 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, - 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, - 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, - 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, - 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, - 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, - 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, - 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, - 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, - 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, - 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, - 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, - 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, - 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, - 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, - 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, - 0x7ce0cdba, 
0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, - 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, - 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, - 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, - 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, - 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, - 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, - 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, - 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, - 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, - 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, - 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, - 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, - 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, - 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, - 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, - 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, - 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, - 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, - 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, - 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, - 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, - 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, - 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, - 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, - 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, - 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, - 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, - 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, - 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, - 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, - 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, - 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, - 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, - 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, - 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, - 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, - 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, - 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, - 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, - 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, - 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, - 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, - 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, - 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975, - 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, - 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, - 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, - 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, - 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, - 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, - 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, - 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, - 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, - 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, - 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, - 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, - 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, - 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, - 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, - 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, - 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, - 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, - 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, - 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, - 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, - 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, - 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, - 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, - 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, - 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, - 0xdc27385b, 
0x7a5033ef, 0x4bb82972, 0xedcf22c6, - 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, - 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, - 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, - 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, - 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, - 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, - 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, - 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, - 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, - 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, - 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, - 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, - 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, - 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, - 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, - 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, - 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, - 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, - 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, - 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, - 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, - 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, - 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, - 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, - 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, - 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, - 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, - 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, - 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac, - 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, - 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, - 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, - 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, - 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, - 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, - 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, - 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, - 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, - 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, - 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, - 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, - 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, - 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, - 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb, - 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, - 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, - 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, - 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, - 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, - 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, - 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, - 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, - 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, - 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, - 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, - 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, - 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, - 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, - 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, - 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, - 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, - 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, - 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, - 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, - 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, - 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, - 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, - 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, - 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, - 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, - 0x52bcd85d, 
0x9e16d8c3, 0x1099df20, 0xdc33dfbe, - 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, - 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, - 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, - 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, - 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, - 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, - 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, - 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, - 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, - 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, - 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, - 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, - 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, - 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, - 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, - 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, - 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, - 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, -#endif /* CRC32_SLICE8 */ -}; diff --git a/2.0/libdeflate/lib/crc32_vec_template.h b/2.0/libdeflate/lib/crc32_vec_template.h deleted file mode 100644 index 9a2ad5bde..000000000 --- a/2.0/libdeflate/lib/crc32_vec_template.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * crc32_vec_template.h - template for vectorized CRC-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#define CRC32_SLICE1 1 -static u32 crc32_slice1(u32, const u8 *, size_t); - -/* - * Template for vectorized CRC-32 implementations. - * - * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead - * of crc32_slice8() because only a few bytes need to be processed, so a smaller - * table is preferable. 
- */ -static u32 ATTRIBUTES -FUNCNAME(u32 remainder, const u8 *p, size_t size) -{ - if ((uintptr_t)p % IMPL_ALIGNMENT) { - size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT); - - remainder = crc32_slice1(remainder, p, n); - p += n; - size -= n; - } - if (size >= IMPL_SEGMENT_SIZE) { - remainder = FUNCNAME_ALIGNED(remainder, (const void *)p, - size / IMPL_SEGMENT_SIZE); - p += size - (size % IMPL_SEGMENT_SIZE); - size %= IMPL_SEGMENT_SIZE; - } - return crc32_slice1(remainder, p, size); -} - -#undef FUNCNAME -#undef FUNCNAME_ALIGNED -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_SIZE diff --git a/2.0/libdeflate/lib/decompress_template.h b/2.0/libdeflate/lib/decompress_template.h index a8cad0053..6036d0bca 100644 --- a/2.0/libdeflate/lib/decompress_template.h +++ b/2.0/libdeflate/lib/decompress_template.h @@ -179,11 +179,11 @@ FUNCNAME(struct libdeflate_decompressor * restrict d, /* Run-length encoded codeword lengths */ /* - * Note: we don't need verify that the repeat count - * doesn't overflow the number of elements, since we've - * sized the lens array to have enough extra space to - * allow for the worst-case overrun (138 zeroes when - * only 1 length was remaining). + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). * * In the case of the small repeat counts (presyms 16 * and 17), it is fastest to always write the maximum @@ -241,6 +241,9 @@ FUNCNAME(struct libdeflate_decompressor * restrict d, } } while (i < num_litlen_syms + num_offset_syms); + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { u16 len, nlen; diff --git a/2.0/libdeflate/lib/deflate_compress.c b/2.0/libdeflate/lib/deflate_compress.c index f957987c4..11c007e18 100644 --- a/2.0/libdeflate/lib/deflate_compress.c +++ b/2.0/libdeflate/lib/deflate_compress.c @@ -28,8 +28,6 @@ #include "deflate_compress.h" #include "deflate_constants.h" -#include "libdeflate.h" - /******************************************************************************/ /* @@ -136,7 +134,8 @@ * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher * value isn't very useful since the calculations are approximate anyway. * - * BIT_COST doesn't apply to deflate_flush_block(), which considers whole bits. + * BIT_COST doesn't apply to deflate_flush_block() and + * deflate_compute_true_cost(), which consider whole bits. */ #define BIT_COST 16 @@ -286,46 +285,26 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { }; /* - * A condensed table which maps offset => offset slot as follows: - * - * offset <= 256: deflate_offset_slot[offset] - * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)] - * - * This table was generated by scripts/gen_offset_slot_map.py. + * Table: 'offset - 1 => offset_slot' for offset <= 256. + * This was generated by scripts/gen_offset_slot_map.py. 
*/ -static const u8 deflate_offset_slot[512] = { - 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, - 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, - 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, +static const u8 deflate_offset_slot[256] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, - 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, }; /* The order in which precode codeword lengths are stored */ @@ -478,6 +457,9 @@ struct libdeflate_compressor { void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os); + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; + /* The compression level with which this compressor was created */ unsigned compression_level; @@ -604,6 +586,9 @@ struct libdeflate_compressor { /* The current cost model being used */ struct deflate_costs costs; + /* Saved cost model */ + struct deflate_costs costs_saved; + /* * A table that maps match offset to offset slot. 
This * differs from deflate_offset_slot[] in that this is a @@ -628,7 +613,48 @@ struct libdeflate_compressor { u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; - unsigned num_optim_passes; + /* + * The maximum number of optimization passes + * (min-cost path searches) per block. + * Larger values = more compression. + */ + unsigned max_optim_passes; + + /* + * If an optimization pass improves the cost by fewer + * than this number of bits, then optimization will stop + * early, before max_optim_passes has been reached. + * Smaller values = more compression. + */ + unsigned min_improvement_to_continue; + + /* + * The minimum number of bits that would need to be + * saved for it to be considered worth the time to + * regenerate and use the min-cost path from a previous + * optimization pass, in the case where the final + * optimization pass actually increased the cost. + * Smaller values = more compression. + */ + unsigned min_bits_to_use_nonfinal_path; + + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + } n; /* (n)ear-optimal */ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -676,24 +702,12 @@ struct deflate_output_bitstream { */ u8 *next; - /* - * Pointer to near the end of the output buffer. 'next' will never - * exceed this. There are OUTPUT_END_PADDING bytes reserved after this - * to allow branchlessly writing a whole word at this location. - */ + /* Pointer to the end of the output buffer */ u8 *end; -}; -/* - * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be - * present following os->end, in order to not overrun the buffer when generating - * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) - * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, - * to make the compression algorithm produce the same result on all CPU - * architectures (which is sometimes desirable), we have to unconditionally use - * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. - */ -#define OUTPUT_END_PADDING 8 + /* true if the output buffer ran out of space */ + bool overflow; +}; /* * Add some bits to the bitbuffer variable of the output bitstream. The caller @@ -707,21 +721,29 @@ do { \ ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) -/* Flush bits from the bitbuffer variable to the output buffer. */ +/* + * Flush bits from the bitbuffer variable to the output buffer. After this, the + * bitbuffer will contain at most 7 bits (a partial byte). + * + * Since deflate_flush_block() verified ahead of time that there is enough space + * remaining before actually writing the block, it's guaranteed that out_next + * won't exceed os->end. However, there might not be enough space remaining to + * flush a whole word, even though that's fastest. 
Therefore, flush a whole + * word if there is space for it, otherwise flush a byte at a time. + */ #define FLUSH_BITS() \ do { \ - if (UNALIGNED_ACCESS_IS_FAST) { \ + if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ /* Flush a whole word (branchlessly). */ \ put_unaligned_leword(bitbuf, out_next); \ bitbuf >>= bitcount & ~7; \ - out_next += MIN((size_t)(out_end - out_next), bitcount >> 3); \ + out_next += bitcount >> 3; \ bitcount &= 7; \ } else { \ /* Flush a byte at a time. */ \ while (bitcount >= 8) { \ - *out_next = bitbuf; \ - if (out_next != out_end) \ - out_next++; \ + ASSERT(out_next < os->end); \ + *out_next++ = bitbuf; \ bitcount -= 8; \ bitbuf >>= 8; \ } \ @@ -771,8 +793,7 @@ heapify_array(u32 A[], unsigned length) * Sort the array 'A', which contains 'length' unsigned 32-bit integers. * * Note: name this function heap_sort() instead of heapsort() to avoid colliding - * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't - * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. + * with heapsort() from stdlib.h on BSD-derived systems. */ static void heap_sort(u32 A[], unsigned length) @@ -1311,7 +1332,6 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, * eventually return the codewords. */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* * 'num_used_syms' is the number of symbols with nonzero frequency. * This may be less than @num_syms. 'num_used_syms' is also the number @@ -1320,30 +1340,34 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, */ /* - * Handle special cases where only 0 or 1 symbols were used (had nonzero - * frequency). + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. When this happens, + * it's usually for the offset code (0-1 symbols used). But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. */ - - if (unlikely(num_used_syms == 0)) { - /* - * Code is empty. sort_symbols() already set all lengths to 0, - * so there is nothing more to do. - */ - return; - } - - if (unlikely(num_used_syms == 1)) { - /* - * Only one symbol was used, so we only need one codeword. 
But - * two codewords are needed to form the smallest complete - * Huffman code, which uses codewords 0 and 1. Therefore, we - * choose another symbol to which to assign a codeword. We use - * 0 (if the used symbol is not 0) or 1 (if the used symbol is - * 0). In either case, the lesser-valued symbol must be - * assigned codeword 0 so that the resulting code is canonical. - */ - - unsigned sym = A[0] & SYMBOL_MASK; + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; unsigned nonzero_idx = sym ? sym : 1; codewords[0] = 0; @@ -1427,20 +1451,30 @@ deflate_init_static_codes(struct libdeflate_compressor *c) /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned -deflate_get_offset_slot(unsigned offset) +deflate_get_offset_slot(u32 offset) { -#if 1 - if (offset <= 256) - return deflate_offset_slot[offset]; - else - return deflate_offset_slot[256 + ((offset - 1) >> 7)]; -#else /* Branchless version */ - u32 i1 = offset; - u32 i2 = 256 + ((offset - 1) >> 7); - u32 is_small = (s32)(offset - 257) >> 31; + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. + * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. + */ + unsigned n = (256 - offset) >> 29; - return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)]; -#endif + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned @@ -1660,6 +1694,12 @@ do { \ /* * Choose the best type of block to use (dynamic Huffman, static Huffman, or * uncompressed), then output it. + * + * The uncompressed data of the block is @block_begin[0..@block_length-1]. The + * sequence of literals and matches that will be used to compress the block (if + * a compressed block is chosen) is given by @sequences if it's non-NULL, or + * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set + * according to the literals, matches, and end-of-block symbol. */ static void deflate_flush_block(struct libdeflate_compressor *c, @@ -1681,29 +1721,26 @@ deflate_flush_block(struct libdeflate_compressor *c, bitbuf_t bitbuf = os->bitbuf; unsigned bitcount = os->bitcount; u8 *out_next = os->next; - u8 * const out_end = os->end; - /* The cost for each block type, in bits */ - u32 dynamic_cost = 0; - u32 static_cost = 0; - u32 uncompressed_cost = 0; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. 
+ */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; u32 best_cost; struct deflate_codes *codes; unsigned sym; - ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block); + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); ASSERT(block_length <= MAX_BLOCK_LENGTH); ASSERT(bitcount <= 7); ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); - ASSERT(out_next <= out_end); - - if (sequences != NULL /* !near_optimal */ || - !SUPPORT_NEAR_OPTIMAL_PARSING) { - /* Tally the end-of-block symbol. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - /* Build dynamic Huffman codes. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); - } /* Else, this was already done. */ + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); /* Precompute the precode items and build the precode. */ deflate_precompute_huffman_header(c); @@ -1761,9 +1798,71 @@ deflate_flush_block(struct libdeflate_compressor *c, UINT16_MAX) - 1)) + (8 * block_length); - /* Choose and output the cheapest type of block. */ - best_cost = MIN(static_cost, uncompressed_cost); - if (dynamic_cost < best_cost) { + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > (size_t)(os->end - out_next)) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + (long)(DIV_ROUND_UP(bitcount + 3, 8) + 4 + len)); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. + */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. 
*/ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; const unsigned num_precode_items = c->o.precode.num_items; unsigned precode_sym, precode_item; @@ -1771,7 +1870,6 @@ deflate_flush_block(struct libdeflate_compressor *c, /* Dynamic Huffman block */ - best_cost = dynamic_cost; codes = &c->codes; STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); ADD_BITS(is_final_block, 1); @@ -1823,54 +1921,6 @@ deflate_flush_block(struct libdeflate_compressor *c, deflate_extra_precode_bits[precode_sym]); FLUSH_BITS(); } while (++i < num_precode_items); - } else if (static_cost < uncompressed_cost) { - /* Static Huffman block */ - codes = &c->static_codes; - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); - FLUSH_BITS(); - } else { - /* - * Uncompressed block(s). DEFLATE limits the length of - * uncompressed blocks to UINT16_MAX bytes, so if the length of - * the "block" we're flushing is over UINT16_MAX, we actually - * output multiple blocks. - */ - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = is_final_block; - len = in_end - in_next; - } - if ((size_t)(out_end - out_next) < - (bitcount + 3 + 7) / 8 + 4 + len) { - /* Not enough output space remaining. */ - out_next = out_end; - goto out; - } - /* - * Output BFINAL (1 bit) and BTYPE (2 bits), then align - * to a byte boundary. - */ - STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); - *out_next++ = (bfinal << bitcount) | bitbuf; - if (bitcount > 5) - *out_next++ = 0; - bitbuf = 0; - bitcount = 0; - /* Output LEN and NLEN, then the data itself. */ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - /* Done outputting uncompressed block(s) */ - goto out; } /* Output the literals and matches for a dynamic or static block. */ @@ -1974,18 +2024,30 @@ deflate_flush_block(struct libdeflate_compressor *c, out: ASSERT(bitcount <= 7); /* - * Assert that the block cost was computed correctly, as + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, * libdeflate_deflate_compress_bound() relies on this via the assumption - * that uncompressed blocks will always be used when cheaper. + * that uncompressed blocks will always be used when cheapest. 
*/ - ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == - 3 + best_cost || out_next == out_end); - + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); os->bitbuf = bitbuf; os->bitcount = bitcount; os->next = out_next; } +static void +deflate_finish_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) +{ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, sequences, + is_final_block); +} + /******************************************************************************/ /* @@ -2270,6 +2332,13 @@ calculate_min_match_len(const u8 *data, size_t data_len, unsigned num_used_literals = 0; size_t i; + /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. + */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + /* * For an initial approximation, scan the first 4 KiB of data. The * caller may use recalculate_min_match_len() to update min_len later. @@ -2446,10 +2515,10 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c, } while (in_next < in_max_block_end && seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); - deflate_flush_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.f.sequences, in_next == in_end); - } while (in_next != in_end); + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.f.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2525,10 +2594,10 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c, !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - deflate_flush_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } static forceinline void @@ -2731,10 +2800,10 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - deflate_flush_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2797,6 +2866,59 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; } +static void +deflate_choose_all_literals(struct libdeflate_compressor *c, + const u8 *block, u32 block_length) +{ + u32 i; + + deflate_reset_symbol_frequencies(c); + for (i = 0; i < block_length; i++) + c->freqs.litlen[block[i]]++; + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + deflate_make_huffman_codes(&c->freqs, &c->codes); +} + +/* + * Compute the exact cost, in bits, that would be required to output the matches + * and literals described by @c->freqs as a dynamic Huffman block. The litlen + * and offset codes are assumed to have already been built in @c->codes. 
+ */ +static u32 +deflate_compute_true_cost(struct libdeflate_compressor *c) +{ + u32 cost = 0; + unsigned sym; + + deflate_precompute_huffman_header(c); + + memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, + DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); + + cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + cost += c->o.precode.freqs[sym] * + (c->o.precode.lens[sym] + + deflate_extra_precode_bits[sym]); + } + + for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) + cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; + + for (; sym < DEFLATE_FIRST_LEN_SYM + + ARRAY_LEN(deflate_extra_length_bits); sym++) + cost += c->freqs.litlen[sym] * + (c->codes.lens.litlen[sym] + + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); + + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) + cost += c->freqs.offset[sym] * + (c->codes.lens.offset[sym] + + deflate_extra_offset_bits[sym]); + return cost; +} + /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs_from_codes(struct libdeflate_compressor *c, @@ -3124,11 +3246,11 @@ deflate_adjust_costs_impl(struct libdeflate_compressor *c, /* * Adjust the costs when beginning a new block. * - * Since the current costs have been optimized for the data, it's undesirable to - * throw them away and start over with the default costs. At the same time, we - * don't want to bias the parse by assuming that the next block will be similar - * to the current block. As a compromise, make the costs closer to the - * defaults, but don't simply set them to the defaults. + * Since the current costs are optimized for the data already, it can be helpful + * to reuse them instead of starting over with the default costs. However, this + * depends on how similar the new block is to the previous block. Therefore, + * use a heuristic to decide how similar the blocks are, and mix together the + * current costs and the default costs accordingly. */ static void deflate_adjust_costs(struct libdeflate_compressor *c, @@ -3159,7 +3281,10 @@ deflate_adjust_costs(struct libdeflate_compressor *c, cutoff = ((u64)c->p.n.prev_num_observations * c->split_stats.num_observations * 200) / 512; - if (4 * total_delta > 9 * cutoff) + if (total_delta > 3 * cutoff) + /* Big change in the data; just use the default costs. */ + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else if (4 * total_delta > 9 * cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); else if (2 * total_delta > 3 * cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); @@ -3169,6 +3294,21 @@ deflate_adjust_costs(struct libdeflate_compressor *c, deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); } +static void +deflate_set_initial_costs(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + bool is_first_block) +{ + u32 lit_cost, len_sym_cost; + + deflate_choose_default_litlen_costs(c, block_begin, block_length, + &lit_cost, &len_sym_cost); + if (is_first_block) + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else + deflate_adjust_costs(c, lit_cost, len_sym_cost); +} + /* * Find the minimum-cost path through the graph of possible match/literal * choices for this block. 
@@ -3251,29 +3391,52 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, } cur_node->cost_to_end = best_cost_to_end; } while (cur_node != &c->p.n.optimum_nodes[0]); + + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); } /* - * Choose the literal/match sequence to use for the current block. The basic - * algorithm finds a minimum-cost path through the block's graph of - * literal/match choices, given a cost model. However, the cost of each symbol - * is unknown until the Huffman codes have been built, but at the same time the - * Huffman codes depend on the frequencies of chosen symbols. Consequently, - * multiple passes must be used to try to approximate an optimal solution. The - * first pass uses default costs, mixed with the costs from the previous block - * if any. Later passes use the Huffman codeword lengths from the previous pass - * as the costs. + * Choose the literals and matches for the current block, then output the block. + * + * To choose the literal/match sequence, we find the minimum-cost path through + * the block's graph of literal/match choices, given a cost model. However, the + * true cost of each symbol is unknown until the Huffman codes have been built, + * but at the same time the Huffman codes depend on the frequencies of chosen + * symbols. Consequently, multiple passes must be used to try to approximate an + * optimal solution. The first pass uses default costs, mixed with the costs + * from the previous block when it seems appropriate. Later passes use the + * Huffman codeword lengths from the previous pass as the costs. + * + * As an alternate strategy, also consider using only literals. The boolean + * returned in *used_only_literals indicates whether that strategy was best. */ static void -deflate_optimize_block(struct libdeflate_compressor *c, - const u8 *block_begin, u32 block_length, - const struct lz_match *cache_ptr, bool is_first_block, - bool is_final_block) +deflate_optimize_and_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + bool *used_only_literals) { - unsigned num_passes_remaining = c->p.n.num_optim_passes; - u32 lit_cost, len_sym_cost; + unsigned num_passes_remaining = c->p.n.max_optim_passes; + u32 best_true_cost = UINT32_MAX; + u32 true_cost; + u32 only_lits_cost; + u32 static_cost = UINT32_MAX; + struct deflate_sequence seq_; + struct deflate_sequence *seq = NULL; u32 i; + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. + */ + deflate_choose_all_literals(c, block_begin, block_length); + only_lits_cost = deflate_compute_true_cost(c); + /* * Force the block to really end at the desired length, even if some * matches extend beyond it. @@ -3283,33 +3446,86 @@ deflate_optimize_block(struct libdeflate_compressor *c, ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - /* Set the initial costs. 
*/ - deflate_choose_default_litlen_costs(c, block_begin, block_length, - &lit_cost, &len_sym_cost); - if (is_first_block) - deflate_set_default_costs(c, lit_cost, len_sym_cost); - else - deflate_adjust_costs(c, lit_cost, len_sym_cost); + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; - do { - /* Find the minimum cost path for this pass. */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ - /* Compute frequencies of the chosen symbols. */ - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + + /* Initialize c->p.n.costs with default costs. */ + deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + + do { + /* + * Find the minimum-cost path for this pass. + * Also set c->freqs and c->codes to match the path. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr); - /* Make the Huffman codes. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); + /* + * Compute the exact cost of the block if the path were to be + * used. Note that this differs from + * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses + * the actual Huffman codes instead of c->p.n.costs. + */ + true_cost = deflate_compute_true_cost(c); /* - * Update the costs. After the last optimization pass, the - * final costs won't be needed for this block, but they will be - * used in determining the initial costs for the next block. + * If the cost didn't improve much from the previous pass, then + * doing more passes probably won't be helpful, so stop early. */ - if (--num_passes_remaining || !is_final_block) + if (true_cost + c->p.n.min_improvement_to_continue > + best_true_cost) + break; + + best_true_cost = true_cost; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; + + /* Update the cost model from the Huffman codes. */ + deflate_set_costs_from_codes(c, &c->codes.lens); + + } while (--num_passes_remaining); + + *used_only_literals = false; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); deflate_set_costs_from_codes(c, &c->codes.lens); - } while (num_passes_remaining); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } + } else if (true_cost >= + best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { + /* + * The best solution was actually from a non-final optimization + * pass, so recover and use the min-cost path from that pass. 
+ */ + c->p.n.costs = c->p.n.costs_saved; + deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_set_costs_from_codes(c, &c->codes.lens); + } + deflate_flush_block(c, os, block_begin, block_length, seq, + is_final_block); } static void @@ -3387,6 +3603,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, unsigned nice_len = MIN(c->nice_match_length, max_len); struct lz_match *cache_ptr = c->p.n.match_cache; u32 next_hashes[2] = {0, 0}; + bool prev_block_used_only_literals = false; bt_matchfinder_init(&c->p.n.bt_mf); deflate_near_optimal_init_stats(c); @@ -3405,8 +3622,16 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, * literal/match statistics gathered during matchfinding. * However, the actual near-optimal parse won't respect min_len, * as it can accurately assess the costs of different matches. + * + * If the "use only literals" strategy happened to be the best + * strategy on the previous block, then probably the + * min_match_len heuristic is still not aggressive enough for + * the data, so force gathering literal stats only. */ - min_len = calculate_min_match_len( + if (prev_block_used_only_literals) + min_len = DEFLATE_MAX_MATCH_LEN + 1; + else + min_len = calculate_min_match_len( in_block_begin, in_max_block_end - in_block_begin, c->max_search_depth); @@ -3583,10 +3808,11 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, } while (--num_bytes_to_rewind); cache_len_rewound = orig_cache_ptr - cache_ptr; - deflate_optimize_block(c, in_block_begin, block_length, - cache_ptr, is_first, is_final); - deflate_flush_block(c, os, in_block_begin, block_length, - NULL, is_final); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); memmove(c->p.n.match_cache, cache_ptr, cache_len_rewound * sizeof(*cache_ptr)); cache_ptr = &c->p.n.match_cache[cache_len_rewound]; @@ -3608,16 +3834,17 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, bool is_final = (in_next == in_end); deflate_near_optimal_merge_stats(c); - deflate_optimize_block(c, in_block_begin, block_length, - cache_ptr, is_first, is_final); - deflate_flush_block(c, os, in_block_begin, block_length, - NULL, is_final); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); cache_ptr = &c->p.n.match_cache[0]; deflate_near_optimal_save_stats(c); deflate_near_optimal_init_stats(c); in_block_begin = in_next; } - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* Initialize c->p.n.offset_slot_full. */ @@ -3641,14 +3868,22 @@ deflate_init_offset_slot_full(struct libdeflate_compressor *c) #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI -libdeflate_alloc_compressor(int compression_level) +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options) { struct libdeflate_compressor *c; size_t size = offsetof(struct libdeflate_compressor, p); check_buildtime_parameters(); + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. 
+ */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + if (compression_level < 0 || compression_level > 12) return NULL; @@ -3664,9 +3899,14 @@ libdeflate_alloc_compressor(int compression_level) size += sizeof(c->p.f); } - c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + c = libdeflate_aligned_malloc(options->malloc_func ? + options->malloc_func : + libdeflate_default_malloc_func, + MATCHFINDER_MEM_ALIGNMENT, size); if (!c) return NULL; + c->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; c->compression_level = compression_level; @@ -3734,22 +3974,31 @@ libdeflate_alloc_compressor(int compression_level) c->impl = deflate_compress_near_optimal; c->max_search_depth = 35; c->nice_match_length = 75; - c->p.n.num_optim_passes = 2; + c->p.n.max_optim_passes = 2; + c->p.n.min_improvement_to_continue = 32; + c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; deflate_init_offset_slot_full(c); break; case 11: c->impl = deflate_compress_near_optimal; - c->max_search_depth = 70; + c->max_search_depth = 100; c->nice_match_length = 150; - c->p.n.num_optim_passes = 3; + c->p.n.max_optim_passes = 4; + c->p.n.min_improvement_to_continue = 16; + c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; deflate_init_offset_slot_full(c); break; case 12: default: c->impl = deflate_compress_near_optimal; - c->max_search_depth = 150; + c->max_search_depth = 300; c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - c->p.n.num_optim_passes = 4; + c->p.n.max_optim_passes = 10; + c->p.n.min_improvement_to_continue = 1; + c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; deflate_init_offset_slot_full(c); break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -3760,7 +4009,17 @@ libdeflate_alloc_compressor(int compression_level) return c; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI + +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); +} + +LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) @@ -3775,38 +4034,40 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c, return deflate_compress_none(in, in_nbytes, out, out_nbytes_avail); - /* - * Initialize the output bitstream structure. - * - * The end is set to OUTPUT_END_PADDING below the true end, so that - * FLUSH_BITS() can be more efficient. - */ - if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING)) - return 0; + /* Initialize the output bitstream structure. */ os.bitbuf = 0; os.bitcount = 0; os.next = out; - os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. */ + if (os.overflow) + return 0; + /* - * If 'os.next' reached 'os.end', then either there was not enough space - * in the output buffer, or the compressed size would have been within - * OUTPUT_END_PADDING of the true end. For performance reasons we don't - * distinguish between these cases; we just make sure to return some - * extra space from libdeflate_deflate_compress_bound(). 
+ * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. */ - if (os.next >= os.end) - return 0; ASSERT(os.bitcount <= 7); - if (os.bitcount) + if (os.bitcount) { + ASSERT(os.next < os.end); *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ return os.next - (u8 *)out; } -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *c) { - libdeflate_aligned_free(c); + if (c) + libdeflate_aligned_free(c->free_func, c); } unsigned int @@ -3815,11 +4076,10 @@ libdeflate_get_compression_level(struct libdeflate_compressor *c) return c->compression_level; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_deflate_compress_bound(__attribute__((unused)) struct libdeflate_compressor *c, size_t in_nbytes) { - size_t bound = 0; size_t max_blocks; /* @@ -3836,10 +4096,12 @@ libdeflate_deflate_compress_bound(__attribute__((unused)) struct libdeflate_comp */ /* - * The minimum length that is passed to deflate_flush_block() is - * MIN_BLOCK_LENGTH bytes, except for the final block if needed. + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. * - * If deflate_flush_block() decides to use an uncompressed block, it + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it * actually will (in general) output a series of uncompressed blocks in * order to stay within the UINT16_MAX limit of DEFLATE. But this can * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', @@ -3858,20 +4120,8 @@ libdeflate_deflate_compress_bound(__attribute__((unused)) struct libdeflate_comp * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the * alignment bits at the very start of the block can be disregarded; * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. */ - bound += 5 * max_blocks; - - /* Account for the data itself, stored uncompressed. */ - bound += in_nbytes; - - /* - * Add 1 + OUTPUT_END_PADDING because for performance reasons, the - * compressor doesn't distinguish between cases where there wasn't - * enough space and cases where the compressed size would have been - * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater. Adding - * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room. - */ - bound += 1 + OUTPUT_END_PADDING; - - return bound; + return (5 * max_blocks) + in_nbytes; } diff --git a/2.0/libdeflate/lib/deflate_decompress.c b/2.0/libdeflate/lib/deflate_decompress.c index d7d79e465..25d225239 100644 --- a/2.0/libdeflate/lib/deflate_decompress.c +++ b/2.0/libdeflate/lib/deflate_decompress.c @@ -42,13 +42,9 @@ * instructions enabled and is used automatically at runtime when supported. 
*/ -#include - #include "lib_common.h" #include "deflate_constants.h" -#include "libdeflate.h" - /* * If the expression passed to SAFETY_CHECK() evaluates to false, then the * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the @@ -675,6 +671,9 @@ struct libdeflate_decompressor { bool static_codes_loaded; unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; }; /* @@ -806,38 +805,48 @@ build_decode_table(u32 decode_table[], u32 entry; unsigned i; + /* + * The DEFLATE RFC explicitly allows the offset code to be + * incomplete in two cases: a code containing just 1 codeword, + * if that codeword has length 1; and a code containing no + * codewords. Note: the list of offset codeword lengths is + * always nonempty, but lengths of 0 don't count as codewords. + * + * The RFC doesn't say whether the same cases are allowed for + * the litlen and pre codes. It's actually impossible for no + * symbols to be used from these codes; however, it's + * technically possible for only one symbol to be used. zlib + * allows 1 codeword for the litlen code, but not the precode. + * The RFC also doesn't say whether, when there is 1 codeword, + * that codeword is '0' or '1'. zlib uses '0'. + * + * We accept what zlib accepts, plus a bit more. First, we + * don't treat the precode more strictly than the litlen and + * offset codes. There's no convincing reason to add a special + * case for the precode here. + * + * Second, we just map each allowed incompete code to a complete + * code with only real symbols. To do this, we choose a symbol, + * either the used symbol (for codes with 1 codeword) or an + * arbitrary symbol (for empty codes), and give it both + * codewords '0' and '1'. zlib instead uses a special ERROR + * symbol in the part of the codespace the code doesn't use. + * However, having an ERROR symbol reduces the performance of + * the Huffman decoder, for no real benefit. Our approach also + * avoids having to decide whether '0' or '1' is correct. + * + * Like zlib, we still reject all incomplete codes that contain + * more than 1 codeword or a codeword length greater than 1. + */ if (codespace_used == 0) { - /* - * An empty code is allowed. This can happen for the - * offset code in DEFLATE, since a dynamic Huffman block - * need not contain any matches. - */ - - /* sym=0, len=1 (arbitrary) */ - entry = make_decode_table_entry(decode_results, 0, 1); + sym = 0; /* arbitrary */ } else { - /* - * Allow codes with a single used symbol, with codeword - * length 1. The DEFLATE RFC is unclear regarding this - * case. What zlib's decompressor does is permit this - * for the litlen and offset codes and assume the - * codeword is '0' rather than '1'. We do the same - * except we allow this for precodes too, since there's - * no convincing reason to treat the codes differently. - * We also assign both codewords '0' and '1' to the - * symbol to avoid having to handle '1' specially. - */ if (codespace_used != (1U << (max_codeword_len - 1)) || len_counts[1] != 1) return false; - entry = make_decode_table_entry(decode_results, - *sorted_syms, 1); + sym = sorted_syms[0]; } - /* - * Note: the decode table still must be fully initialized, in - * case the stream is malformed and contains bits from the part - * of the codespace the incomplete code doesn't use. 
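As a concrete example of the mapping described above: a dynamic block may declare an offset code whose only codeword has length 1, say for symbol 5 (a hypothetical but legal incomplete code). Both bit patterns are then made to decode to that one real symbol:

	/* codespace_used == 1U << (max_codeword_len - 1), len_counts[1] == 1 */
	sym = sorted_syms[0];	/* == 5 in this example */
	entry = make_decode_table_entry(decode_results, sym, 1);
	for (i = 0; i < (1U << table_bits); i++)
		decode_table[i] = entry;	/* '0' and '1' both yield symbol 5 */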
- */ + entry = make_decode_table_entry(decode_results, sym, 1); for (i = 0; i < (1U << table_bits); i++) decode_table[i] = entry; return true; @@ -1075,7 +1084,7 @@ typedef enum libdeflate_result (*decompress_func_t) /* Include architecture-specific implementation(s) if available. */ #undef DEFAULT_IMPL #undef arch_select_decompress_func -#if defined(__i386__) || defined(__x86_64__) +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/decompress_impl.h" #endif @@ -1121,7 +1130,7 @@ dispatch_decomp(struct libdeflate_decompressor *d, * handles calling the appropriate implementation depending on the CPU features * at runtime. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -1132,7 +1141,7 @@ libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, actual_in_nbytes_ret, actual_out_nbytes_ret); } -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -1143,9 +1152,22 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *d, NULL, actual_out_nbytes_ret); } -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI -libdeflate_alloc_decompressor(void) +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) { + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ? options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; /* * Note that only certain parts of the decompressor actually must be * initialized here: @@ -1159,18 +1181,28 @@ libdeflate_alloc_decompressor(void) * valgrind, since build_decode_table() is guaranteed to initialize * all entries eventually anyway.) * + * - 'free_func' must be set. + * * But for simplicity, we currently just zero the whole decompressor. */ - struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); - - if (d == NULL) - return NULL; memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? 
+ options->free_func : libdeflate_default_free_func; return d; } -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + +LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *d) { - libdeflate_free(d); + if (d) + d->free_func(d); } diff --git a/2.0/libdeflate/lib/gzip_compress.c b/2.0/libdeflate/lib/gzip_compress.c index 124375291..b7d5076e2 100644 --- a/2.0/libdeflate/lib/gzip_compress.c +++ b/2.0/libdeflate/lib/gzip_compress.c @@ -28,9 +28,7 @@ #include "deflate_compress.h" #include "gzip_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) @@ -83,7 +81,7 @@ libdeflate_gzip_compress(struct libdeflate_compressor *c, return out_next - (u8 *)out; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes) { diff --git a/2.0/libdeflate/lib/gzip_decompress.c b/2.0/libdeflate/lib/gzip_decompress.c index eb5c44335..30adc1769 100644 --- a/2.0/libdeflate/lib/gzip_decompress.c +++ b/2.0/libdeflate/lib/gzip_decompress.c @@ -28,9 +28,7 @@ #include "lib_common.h" #include "gzip_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -134,7 +132,7 @@ libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, return LIBDEFLATE_SUCCESS; } -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, diff --git a/2.0/libdeflate/lib/hc_matchfinder.h b/2.0/libdeflate/lib/hc_matchfinder.h index a690c0ae3..e9e2090a8 100644 --- a/2.0/libdeflate/lib/hc_matchfinder.h +++ b/2.0/libdeflate/lib/hc_matchfinder.h @@ -116,7 +116,7 @@ (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) -struct hc_matchfinder { +struct MATCHFINDER_ALIGNED hc_matchfinder { /* The hash table for finding length 3 matches */ mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; @@ -128,8 +128,7 @@ struct hc_matchfinder { /* The "next node" references for the linked lists. The "next node" of * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; - -} MATCHFINDER_ALIGNED; +}; /* Prepare the matchfinder for a new input buffer. 
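The new *_ex allocators take a caller-provided struct libdeflate_options. Based on the fields this patch references (sizeof_options, malloc_func, free_func), a caller plugging in custom allocators might look roughly like the following sketch (hypothetical wrapper names, error handling elided); libdeflate_alloc_decompressor_ex() follows the same pattern:

#include <stdlib.h>
#include "libdeflate.h"

static void *my_malloc(size_t size) { return malloc(size); }
static void my_free(void *ptr) { free(ptr); }

static struct libdeflate_compressor *
make_compressor_with_custom_allocator(int level)
{
	struct libdeflate_options opts = {
		.sizeof_options = sizeof(opts),
		.malloc_func = my_malloc,
		.free_func = my_free,
	};

	/* NULL on a bad level, an unrecognized options size, or allocation
	 * failure; the recorded free_func is later used by
	 * libdeflate_free_compressor(). */
	return libdeflate_alloc_compressor_ex(level, &opts);
}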
*/ static forceinline void diff --git a/2.0/libdeflate/lib/ht_matchfinder.h b/2.0/libdeflate/lib/ht_matchfinder.h index 898e99394..7588073da 100644 --- a/2.0/libdeflate/lib/ht_matchfinder.h +++ b/2.0/libdeflate/lib/ht_matchfinder.h @@ -54,10 +54,10 @@ /* Minimum value of max_len for ht_matchfinder_longest_match() */ #define HT_MATCHFINDER_REQUIRED_NBYTES 5 -struct ht_matchfinder { +struct MATCHFINDER_ALIGNED ht_matchfinder { mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER] [HT_MATCHFINDER_BUCKET_SIZE]; -} MATCHFINDER_ALIGNED; +}; static forceinline void ht_matchfinder_init(struct ht_matchfinder *mf) diff --git a/2.0/libdeflate/lib/lib_common.h b/2.0/libdeflate/lib/lib_common.h index 10d0160ac..8c9ff5fe0 100644 --- a/2.0/libdeflate/lib/lib_common.h +++ b/2.0/libdeflate/lib/lib_common.h @@ -6,19 +6,48 @@ #define LIB_LIB_COMMON_H #ifdef LIBDEFLATE_H + /* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ # error "lib_common.h must always be included before libdeflate.h" - /* because BUILDING_LIBDEFLATE must be set first */ #endif -#define BUILDING_LIBDEFLATE +#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) +#elif defined(__GNUC__) +# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) +#else +# define LIBDEFLATE_EXPORT_SYM +#endif + +/* + * On i386, gcc assumes that the stack is 16-byte aligned at function entry. + * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi) + * only guarantee 4-byte alignment when calling functions. This is mainly an + * issue on Windows, but it has been seen on Linux too. Work around this ABI + * incompatibility by realigning the stack pointer when entering libdeflate. + * This prevents crashes in SSE/AVX code. 
+ */ +#if defined(__GNUC__) && defined(__i386__) +# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATE_ALIGN_STACK +#endif + +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK #include "../common_defs.h" -void *libdeflate_malloc(size_t size); -void libdeflate_free(void *ptr); +typedef void *(*malloc_func_t)(size_t); +typedef void (*free_func_t)(void *); + +extern malloc_func_t libdeflate_default_malloc_func; +extern free_func_t libdeflate_default_free_func; -void *libdeflate_aligned_malloc(size_t alignment, size_t size); -void libdeflate_aligned_free(void *ptr); +void *libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size); +void libdeflate_aligned_free(free_func_t free_func, void *ptr); #ifdef FREESTANDING /* diff --git a/2.0/libdeflate/lib/matchfinder_common.h b/2.0/libdeflate/lib/matchfinder_common.h index e0e090530..48a243e1d 100644 --- a/2.0/libdeflate/lib/matchfinder_common.h +++ b/2.0/libdeflate/lib/matchfinder_common.h @@ -61,9 +61,9 @@ typedef s16 mf_pos_t; #undef matchfinder_rebase #ifdef _aligned_attribute # define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) -# if defined(__arm__) || defined(__aarch64__) +# if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/matchfinder_impl.h" -# elif defined(__i386__) || defined(__x86_64__) +# elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/matchfinder_impl.h" # endif #else diff --git a/2.0/libdeflate/lib/unaligned.h b/2.0/libdeflate/lib/unaligned.h deleted file mode 100644 index bb48bf828..000000000 --- a/2.0/libdeflate/lib/unaligned.h +++ /dev/null @@ -1,228 +0,0 @@ -/* - * unaligned.h - inline functions for unaligned memory accesses - */ - -#ifndef LIB_UNALIGNED_H -#define LIB_UNALIGNED_H - -#include "lib_common.h" - -/***** Unaligned loads and stores without endianness conversion *****/ - -/* - * memcpy() is portable, and it usually gets optimized appropriately by modern - * compilers. I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled - * to a load or store instruction, not to an actual function call. - * - * We no longer use the "packed struct" approach, as that is nonstandard, has - * unclear semantics, and doesn't receive enough testing - * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). - * - * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception - * where memcpy() generates inefficient code - * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer - * consider that one case important enough to maintain different code for. - * If you run into it, please just use a newer version of gcc (or use clang). 
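libdeflate_aligned_malloc() and libdeflate_aligned_free() now receive the chosen allocator explicitly. The underlying layout is the usual over-allocate-and-stash idiom (the utils.c hunk later in this patch implements it with the ALIGN() macro); a generic sketch, assuming a power-of-two alignment and uintptr_t from <stdint.h>:

static void *
aligned_malloc_sketch(void *(*malloc_func)(size_t), size_t alignment, size_t size)
{
	/* Reserve room for the saved pointer plus worst-case padding. */
	void *orig = malloc_func(sizeof(void *) + alignment - 1 + size);
	uintptr_t aligned;

	if (orig == NULL)
		return NULL;
	/* Round up past the saved-pointer slot to the next aligned address. */
	aligned = ((uintptr_t)orig + sizeof(void *) + alignment - 1) &
		  ~(uintptr_t)(alignment - 1);
	/* Stash the original pointer just below the aligned block. */
	((void **)aligned)[-1] = orig;
	return (void *)aligned;
}

static void
aligned_free_sketch(void (*free_func)(void *), void *ptr)
{
	free_func(((void **)ptr)[-1]);
}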
- */ - -#define DEFINE_UNALIGNED_TYPE(type) \ -static forceinline type \ -load_##type##_unaligned(const void *p) \ -{ \ - type v; \ - memcpy(&v, p, sizeof(v)); \ - return v; \ -} \ - \ -static forceinline void \ -store_##type##_unaligned(type v, void *p) \ -{ \ - memcpy(p, &v, sizeof(v)); \ -} - -DEFINE_UNALIGNED_TYPE(u16) -DEFINE_UNALIGNED_TYPE(u32) -DEFINE_UNALIGNED_TYPE(u64) -DEFINE_UNALIGNED_TYPE(machine_word_t) - -#define load_word_unaligned load_machine_word_t_unaligned -#define store_word_unaligned store_machine_word_t_unaligned - -/***** Unaligned loads with endianness conversion *****/ - -static forceinline u16 -get_unaligned_le16(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le16_bswap(load_u16_unaligned(p)); - else - return ((u16)p[1] << 8) | p[0]; -} - -static forceinline u16 -get_unaligned_be16(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return be16_bswap(load_u16_unaligned(p)); - else - return ((u16)p[0] << 8) | p[1]; -} - -static forceinline u32 -get_unaligned_le32(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le32_bswap(load_u32_unaligned(p)); - else - return ((u32)p[3] << 24) | ((u32)p[2] << 16) | - ((u32)p[1] << 8) | p[0]; -} - -static forceinline u32 -get_unaligned_be32(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return be32_bswap(load_u32_unaligned(p)); - else - return ((u32)p[0] << 24) | ((u32)p[1] << 16) | - ((u32)p[2] << 8) | p[3]; -} - -static forceinline u64 -get_unaligned_le64(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le64_bswap(load_u64_unaligned(p)); - else - return ((u64)p[7] << 56) | ((u64)p[6] << 48) | - ((u64)p[5] << 40) | ((u64)p[4] << 32) | - ((u64)p[3] << 24) | ((u64)p[2] << 16) | - ((u64)p[1] << 8) | p[0]; -} - -static forceinline machine_word_t -get_unaligned_leword(const u8 *p) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - return get_unaligned_le32(p); - else - return get_unaligned_le64(p); -} - -/***** Unaligned stores with endianness conversion *****/ - -static forceinline void -put_unaligned_le16(u16 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u16_unaligned(le16_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - } -} - -static forceinline void -put_unaligned_be16(u16 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u16_unaligned(be16_bswap(v), p); - } else { - p[0] = (u8)(v >> 8); - p[1] = (u8)(v >> 0); - } -} - -static forceinline void -put_unaligned_le32(u32 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u32_unaligned(le32_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - p[2] = (u8)(v >> 16); - p[3] = (u8)(v >> 24); - } -} - -static forceinline void -put_unaligned_be32(u32 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u32_unaligned(be32_bswap(v), p); - } else { - p[0] = (u8)(v >> 24); - p[1] = (u8)(v >> 16); - p[2] = (u8)(v >> 8); - p[3] = (u8)(v >> 0); - } -} - -static forceinline void -put_unaligned_le64(u64 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u64_unaligned(le64_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - p[2] = (u8)(v >> 16); - p[3] = (u8)(v >> 24); - p[4] = (u8)(v >> 32); - p[5] = (u8)(v >> 40); - p[6] = (u8)(v >> 48); - p[7] = (u8)(v >> 56); - } -} - -static forceinline void -put_unaligned_leword(machine_word_t v, u8 *p) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - put_unaligned_le32(v, p); - else - put_unaligned_le64(v, p); -} - -/***** 24-bit loads *****/ - -/* - * Given a 32-bit value that was loaded 
with the platform's native endianness, - * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 - * bits contain the first 3 bytes, arranged in octets in a platform-dependent - * order, at the memory location from which the input 32-bit value was loaded. - */ -static forceinline u32 -loaded_u32_to_u24(u32 v) -{ - if (CPU_IS_LITTLE_ENDIAN()) - return v & 0xFFFFFF; - else - return v >> 8; -} - -/* - * Load the next 3 bytes from the memory location @p into the 24 low-order bits - * of a 32-bit value. The order in which the 3 bytes will be arranged as octets - * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES - * bytes must be available at @p; note that this may be more than 3. - */ -static forceinline u32 -load_u24_unaligned(const u8 *p) -{ -#if UNALIGNED_ACCESS_IS_FAST -# define LOAD_U24_REQUIRED_NBYTES 4 - return loaded_u32_to_u24(load_u32_unaligned(p)); -#else -# define LOAD_U24_REQUIRED_NBYTES 3 - if (CPU_IS_LITTLE_ENDIAN()) - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); - else - return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); -#endif -} - -#endif /* LIB_UNALIGNED_H */ diff --git a/2.0/libdeflate/lib/utils.c b/2.0/libdeflate/lib/utils.c index 46826ef48..c1e4cc26f 100644 --- a/2.0/libdeflate/lib/utils.c +++ b/2.0/libdeflate/lib/utils.c @@ -27,8 +27,6 @@ #include "lib_common.h" -#include "libdeflate.h" - #ifdef FREESTANDING # define malloc NULL # define free NULL @@ -36,27 +34,18 @@ # include #endif -static void *(*libdeflate_malloc_func)(size_t) = malloc; -static void (*libdeflate_free_func)(void *) = free; +malloc_func_t libdeflate_default_malloc_func = malloc; +free_func_t libdeflate_default_free_func = free; void * -libdeflate_malloc(size_t size) +libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size) { - return (*libdeflate_malloc_func)(size); -} + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); -void -libdeflate_free(void *ptr) -{ - (*libdeflate_free_func)(ptr); -} - -void * -libdeflate_aligned_malloc(size_t alignment, size_t size) -{ - void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size); if (ptr) { void *orig_ptr = ptr; + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); ((void **)ptr)[-1] = orig_ptr; } @@ -64,18 +53,17 @@ libdeflate_aligned_malloc(size_t alignment, size_t size) } void -libdeflate_aligned_free(void *ptr) +libdeflate_aligned_free(free_func_t free_func, void *ptr) { - if (ptr) - libdeflate_free(((void **)ptr)[-1]); + (*free_func)(((void **)ptr)[-1]); } -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), - void (*free_func)(void *)) +LIBDEFLATEAPI void +libdeflate_set_memory_allocator(malloc_func_t malloc_func, + free_func_t free_func) { - libdeflate_malloc_func = malloc_func; - libdeflate_free_func = free_func; + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; } /* diff --git a/2.0/libdeflate/lib/x86/adler32_impl.h b/2.0/libdeflate/lib/x86/adler32_impl.h index 52a0c5b04..6285dc80a 100644 --- a/2.0/libdeflate/lib/x86/adler32_impl.h +++ b/2.0/libdeflate/lib/x86/adler32_impl.h @@ -43,30 +43,46 @@ #define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \ { \ - __v4su s1_last = (v_s1), s2_last = (v_s2); \ + __m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \ \ /* 128 => 32 bits */ \ - s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \ - s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \ - 
s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \ + s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \ + s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \ + s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \ \ - *(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \ + *(s1) += (u32)_mm_cvtsi128_si32(s1_last); \ + *(s2) += (u32)_mm_cvtsi128_si32(s2_last); \ } #define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \ { \ - __v4su s1_128bit, s2_128bit; \ + __m128i /* __v4su */ s1_128bit, s2_128bit; \ \ /* 256 => 128 bits */ \ - s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \ - (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1); \ - s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \ - (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1); \ + s1_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s1), 0), \ + _mm256_extracti128_si256((v_s1), 1)); \ + s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \ + _mm256_extracti128_si256((v_s2), 1)); \ \ ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ } +/* + * This is a very silly partial workaround for gcc bug + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to + * generate extra move instructions in some loops containing vector intrinsics. + * + * An alternate workaround would be to use gcc native vector operations instead + * of vector intrinsics. But that would result in MSVC needing its own code. + */ +#if GCC_PREREQ(1, 0) +# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ + __asm__("" : "+x" (a), "+x" (b), "+x" (c), "+x" (d), "+x" (e), "+x" (f)) +#else +# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ + (void)a, (void)b, (void)c, (void)d, (void)e, (void)f +#endif + /* SSE2 implementation */ #if HAVE_SSE2_INTRIN # define adler32_sse2 adler32_sse2 @@ -83,29 +99,37 @@ # if HAVE_SSE2_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("sse2"))) +# define ATTRIBUTES _target_attribute("sse2") # endif # include static forceinline ATTRIBUTES void adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) { const __m128i zeroes = _mm_setzero_si128(); + const __m128i /* __v8hu */ mults_a = + _mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25); + const __m128i /* __v8hu */ mults_b = + _mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17); + const __m128i /* __v8hu */ mults_c = + _mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9); + const __m128i /* __v8hu */ mults_d = + _mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1); /* s1 counters: 32-bit, sum of bytes */ - __v4su v_s1 = (__v4su)zeroes; + __m128i /* __v4su */ v_s1 = zeroes; /* s2 counters: 32-bit, sum of s1 values */ - __v4su v_s2 = (__v4su)zeroes; + __m128i /* __v4su */ v_s2 = zeroes; /* * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes * that eventually need to be multiplied by a number 32...1 for addition * into s2. */ - __v8hu v_byte_sums_a = (__v8hu)zeroes; - __v8hu v_byte_sums_b = (__v8hu)zeroes; - __v8hu v_byte_sums_c = (__v8hu)zeroes; - __v8hu v_byte_sums_d = (__v8hu)zeroes; + __m128i /* __v8hu */ v_byte_sums_a = zeroes; + __m128i /* __v8hu */ v_byte_sums_b = zeroes; + __m128i /* __v8hu */ v_byte_sums_c = zeroes; + __m128i /* __v8hu */ v_byte_sums_d = zeroes; do { /* Load the next 32 bytes. 
*/ @@ -117,37 +141,39 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) * Logically, this really should be v_s2 += v_s1 * 32, but we * can do the multiplication (or left shift) later. */ - v_s2 += v_s1; + v_s2 = _mm_add_epi32(v_s2, v_s1); /* * s1 update: use "Packed Sum of Absolute Differences" to add * the bytes horizontally with 8 bytes per sum. Then add the * sums to the s1 counters. */ - v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes); - v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zeroes)); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zeroes)); /* * Also accumulate the bytes into 32 separate counters that have * 16-bit precision. */ - v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes); - v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes); - v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes); - v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes); + v_byte_sums_a = _mm_add_epi16( + v_byte_sums_a, _mm_unpacklo_epi8(bytes1, zeroes)); + v_byte_sums_b = _mm_add_epi16( + v_byte_sums_b, _mm_unpackhi_epi8(bytes1, zeroes)); + v_byte_sums_c = _mm_add_epi16( + v_byte_sums_c, _mm_unpacklo_epi8(bytes2, zeroes)); + v_byte_sums_d = _mm_add_epi16( + v_byte_sums_d, _mm_unpackhi_epi8(bytes2, zeroes)); + GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, + v_byte_sums_c, v_byte_sums_d); } while (p != end); /* Finish calculating the s2 counters. */ - v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a, - (__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b, - (__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c, - (__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d, - (__m128i)(__v8hu){ 8, 7, 6, 5, 4, 3, 2, 1 }); + v_s2 = _mm_slli_epi32(v_s2, 5); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_a, mults_a)); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_b, mults_b)); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_c, mults_c)); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d)); /* Add the counters to the real s1 and s2. */ ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2); @@ -169,9 +195,17 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) # if HAVE_AVX2_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("avx2"))) +# define ATTRIBUTES _target_attribute("avx2") # endif # include + /* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. + */ +# if defined(__clang__) && defined(_MSC_VER) +# include +# include +# endif static forceinline ATTRIBUTES void adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) { @@ -180,43 +214,50 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) * Note, the multipliers have to be in this order because * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately. 
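The SSE2 kernel above and the AVX2 kernel that follows vectorize the two standard Adler-32 sums: s1, the running sum of bytes, and s2, the running sum of successive s1 values. For reference, a simplified scalar sketch of what they compute; the real code batches the mod-65521 reductions per chunk rather than applying them per byte:

static u32
adler32_scalar_sketch(u32 adler, const u8 *p, size_t len)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	size_t i;

	for (i = 0; i < len; i++) {
		s1 = (s1 + p[i]) % 65521;	/* sum of bytes */
		s2 = (s2 + s1) % 65521;		/* sum of the s1 values */
	}
	return (s2 << 16) | s1;
}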
*/ - const __v16hu mults_a = { 64, 63, 62, 61, 60, 59, 58, 57, - 48, 47, 46, 45, 44, 43, 42, 41, }; - const __v16hu mults_b = { 56, 55, 54, 53, 52, 51, 50, 49, - 40, 39, 38, 37, 36, 35, 34, 33, }; - const __v16hu mults_c = { 32, 31, 30, 29, 28, 27, 26, 25, - 16, 15, 14, 13, 12, 11, 10, 9, }; - const __v16hu mults_d = { 24, 23, 22, 21, 20, 19, 18, 17, - 8, 7, 6, 5, 4, 3, 2, 1, }; - __v8su v_s1 = (__v8su)zeroes; - __v8su v_s2 = (__v8su)zeroes; - __v16hu v_byte_sums_a = (__v16hu)zeroes; - __v16hu v_byte_sums_b = (__v16hu)zeroes; - __v16hu v_byte_sums_c = (__v16hu)zeroes; - __v16hu v_byte_sums_d = (__v16hu)zeroes; + const __m256i /* __v16hu */ mults_a = + _mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57, + 48, 47, 46, 45, 44, 43, 42, 41); + const __m256i /* __v16hu */ mults_b = + _mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49, + 40, 39, 38, 37, 36, 35, 34, 33); + const __m256i /* __v16hu */ mults_c = + _mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25, + 16, 15, 14, 13, 12, 11, 10, 9); + const __m256i /* __v16hu */ mults_d = + _mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17, + 8, 7, 6, 5, 4, 3, 2, 1); + __m256i /* __v8su */ v_s1 = zeroes; + __m256i /* __v8su */ v_s2 = zeroes; + __m256i /* __v16hu */ v_byte_sums_a = zeroes; + __m256i /* __v16hu */ v_byte_sums_b = zeroes; + __m256i /* __v16hu */ v_byte_sums_c = zeroes; + __m256i /* __v16hu */ v_byte_sums_d = zeroes; do { const __m256i bytes1 = *p++; const __m256i bytes2 = *p++; - v_s2 += v_s1; - v_s1 += (__v8su)_mm256_sad_epu8(bytes1, zeroes); - v_s1 += (__v8su)_mm256_sad_epu8(bytes2, zeroes); - v_byte_sums_a += (__v16hu)_mm256_unpacklo_epi8(bytes1, zeroes); - v_byte_sums_b += (__v16hu)_mm256_unpackhi_epi8(bytes1, zeroes); - v_byte_sums_c += (__v16hu)_mm256_unpacklo_epi8(bytes2, zeroes); - v_byte_sums_d += (__v16hu)_mm256_unpackhi_epi8(bytes2, zeroes); + v_s2 = _mm256_add_epi32(v_s2, v_s1); + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes1, zeroes)); + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes2, zeroes)); + v_byte_sums_a = _mm256_add_epi16( + v_byte_sums_a, _mm256_unpacklo_epi8(bytes1, zeroes)); + v_byte_sums_b = _mm256_add_epi16( + v_byte_sums_b, _mm256_unpackhi_epi8(bytes1, zeroes)); + v_byte_sums_c = _mm256_add_epi16( + v_byte_sums_c, _mm256_unpacklo_epi8(bytes2, zeroes)); + v_byte_sums_d = _mm256_add_epi16( + v_byte_sums_d, _mm256_unpackhi_epi8(bytes2, zeroes)); + + GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, + v_byte_sums_c, v_byte_sums_d); } while (p != end); - v_s2 = (__v8su)_mm256_slli_epi32((__m256i)v_s2, 6); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_a, - (__m256i)mults_a); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_b, - (__m256i)mults_b); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_c, - (__m256i)mults_c); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_d, - (__m256i)mults_d); + v_s2 = _mm256_slli_epi32(v_s2, 6); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_a, mults_a)); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b)); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c)); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d)); ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2); } # include "../adler32_vec_template.h" diff --git a/2.0/libdeflate/lib/x86/cpu_features.h b/2.0/libdeflate/lib/x86/cpu_features.h index 796cfa8d9..ad14e435a 100644 --- a/2.0/libdeflate/lib/x86/cpu_features.h +++ b/2.0/libdeflate/lib/x86/cpu_features.h @@ -32,9 +32,9 @@ #define 
HAVE_DYNAMIC_X86_CPU_FEATURES 0 -#if defined(__i386__) || defined(__x86_64__) +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER) # undef HAVE_DYNAMIC_X86_CPU_FEATURES # define HAVE_DYNAMIC_X86_CPU_FEATURES 1 #endif @@ -73,55 +73,36 @@ static inline u32 get_x86_cpu_features(void) { return 0; } * functions. Unfortunately clang has no feature test macro for this, so we * have to check its version. */ -#define HAVE_TARGET_INTRINSICS \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000))) - -/* - * Before gcc 5.1 and clang 3.9, emmintrin.h only defined vectors of signed - * integers (e.g. __v4si), not vectors of unsigned integers (e.g. __v4su). We - * need the unsigned ones to avoid signed integer overflow, which is undefined - * behavior. Add the missing definitions for the unsigned ones if needed. - */ -#if (GCC_PREREQ(4, 0) && !GCC_PREREQ(5, 1)) || \ - (defined(__clang__) && !CLANG_PREREQ(3, 9, 8020000)) || \ - defined(__INTEL_COMPILER) -typedef unsigned long long __v2du __attribute__((__vector_size__(16))); -typedef unsigned int __v4su __attribute__((__vector_size__(16))); -typedef unsigned short __v8hu __attribute__((__vector_size__(16))); -typedef unsigned char __v16qu __attribute__((__vector_size__(16))); -typedef unsigned long long __v4du __attribute__((__vector_size__(32))); -typedef unsigned int __v8su __attribute__((__vector_size__(32))); -typedef unsigned short __v16hu __attribute__((__vector_size__(32))); -typedef unsigned char __v32qu __attribute__((__vector_size__(32))); -#endif -#ifdef __INTEL_COMPILER -typedef int __v16si __attribute__((__vector_size__(64))); -typedef short __v32hi __attribute__((__vector_size__(64))); -typedef char __v64qi __attribute__((__vector_size__(64))); +#if HAVE_DYNAMIC_X86_CPU_FEATURES && \ + (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER)) +# define HAVE_TARGET_INTRINSICS 1 +#else +# define HAVE_TARGET_INTRINSICS 0 #endif /* SSE2 */ -#ifdef __SSE2__ +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && \ + (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) # define HAVE_SSE2_NATIVE 1 #else # define HAVE_SSE2_NATIVE 0 #endif -#define HAVE_SSE2_TARGET HAVE_DYNAMIC_X86_CPU_FEATURES -#define HAVE_SSE2_INTRIN \ - (HAVE_SSE2_NATIVE || (HAVE_SSE2_TARGET && HAVE_TARGET_INTRINSICS)) +#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) /* PCLMUL */ -#ifdef __PCLMUL__ +#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) # define HAVE_PCLMUL_NATIVE 1 #else # define HAVE_PCLMUL_NATIVE 0 #endif -#define HAVE_PCLMUL_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))) -#define HAVE_PCLMUL_INTRIN \ - (HAVE_PCLMUL_NATIVE || (HAVE_PCLMUL_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER))) +# define HAVE_PCLMUL_INTRIN 1 +#else +# define HAVE_PCLMUL_INTRIN 0 +#endif /* AVX */ #ifdef __AVX__ @@ -129,11 +110,13 @@ typedef char __v64qi __attribute__((__vector_size__(64))); #else # define HAVE_AVX_NATIVE 0 #endif -#define HAVE_AVX_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))) -#define HAVE_AVX_INTRIN \ - (HAVE_AVX_NATIVE || (HAVE_AVX_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && 
\ + (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX_INTRIN 1 +#else +# define HAVE_AVX_INTRIN 0 +#endif /* AVX2 */ #ifdef __AVX2__ @@ -141,24 +124,38 @@ typedef char __v64qi __attribute__((__vector_size__(64))); #else # define HAVE_AVX2_NATIVE 0 #endif -#define HAVE_AVX2_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256))) -#define HAVE_AVX2_INTRIN \ - (HAVE_AVX2_NATIVE || (HAVE_AVX2_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX2_INTRIN 1 +#else +# define HAVE_AVX2_INTRIN 0 +#endif /* BMI2 */ -#ifdef __BMI2__ +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) # define HAVE_BMI2_NATIVE 1 #else # define HAVE_BMI2_NATIVE 0 #endif -#define HAVE_BMI2_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))) -#define HAVE_BMI2_INTRIN \ - (HAVE_BMI2_NATIVE || (HAVE_BMI2_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_BMI2_INTRIN 1 +#else +# define HAVE_BMI2_INTRIN 0 +#endif +/* + * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. + */ +#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */ +# undef HAVE_BMI2_NATIVE +# undef HAVE_BMI2_INTRIN +# define HAVE_BMI2_NATIVE 0 +# define HAVE_BMI2_INTRIN 0 +#endif -#endif /* __i386__ || __x86_64__ */ +#endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/2.0/libdeflate/lib/x86/crc32_impl.h b/2.0/libdeflate/lib/x86/crc32_impl.h index 10ac8adc8..79cc7944e 100644 --- a/2.0/libdeflate/lib/x86/crc32_impl.h +++ b/2.0/libdeflate/lib/x86/crc32_impl.h @@ -37,7 +37,7 @@ # if HAVE_PCLMUL_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("pclmul"))) +# define ATTRIBUTES _target_attribute("pclmul") # endif # define FOLD_PARTIAL_VECS 0 # include "crc32_pclmul_template.h" @@ -52,16 +52,17 @@ * SSSE3 and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for * efficient handling of partial blocks. (We *could* compile a variant with * PCLMUL+SSSE3+SSE4.1 w/o AVX, but for simplicity we don't currently bother.) + * + * FIXME: with MSVC, this isn't actually compiled with AVX code generation + * enabled yet. That would require that this be moved to its own .c file. */ -#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN && \ - ((HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE) || \ - (HAVE_PCLMUL_TARGET && HAVE_AVX_TARGET)) +#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN # define crc32_x86_pclmul_avx crc32_x86_pclmul_avx # define SUFFIX _pclmul_avx # if HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("pclmul,avx"))) +# define ATTRIBUTES _target_attribute("pclmul,avx") # endif # define FOLD_PARTIAL_VECS 1 # include "crc32_pclmul_template.h" diff --git a/2.0/libdeflate/lib/x86/crc32_pclmul_template.h b/2.0/libdeflate/lib/x86/crc32_pclmul_template.h index a8ea91b41..1d5782375 100644 --- a/2.0/libdeflate/lib/x86/crc32_pclmul_template.h +++ b/2.0/libdeflate/lib/x86/crc32_pclmul_template.h @@ -49,10 +49,19 @@ */ #include +/* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. 
+ */ +#if defined(__clang__) && defined(_MSC_VER) +# include +# include +# include +#endif #undef fold_vec static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __v2di multipliers) +ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __m128i /* __v2di */ multipliers) { /* * The immediate constant for PCLMULQDQ specifies which 64-bit halves of @@ -61,8 +70,9 @@ ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __v2di multipliers) * 0x00 means low halves (higher degree polynomial terms for us) * 0x11 means high halves (lower degree polynomial terms for us) */ - return dst ^ _mm_clmulepi64_si128(src, multipliers, 0x00) ^ - _mm_clmulepi64_si128(src, multipliers, 0x11); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); + return dst; } #define fold_vec ADD_SUFFIX(fold_vec) @@ -77,7 +87,7 @@ ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __v2di multipliers) #undef fold_partial_vec static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, - __v2di multipliers_1) + __m128i /* __v2du */ multipliers_1) { /* * pshufb(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. @@ -115,13 +125,20 @@ ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, static u32 ATTRIBUTES MAYBE_UNUSED ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) { - const __v2di multipliers_8 = (__v2di)CRC32_8VECS_MULTS; - const __v2di multipliers_4 = (__v2di)CRC32_4VECS_MULTS; - const __v2di multipliers_2 = (__v2di)CRC32_2VECS_MULTS; - const __v2di multipliers_1 = (__v2di)CRC32_1VECS_MULTS; - const __v2di final_multiplier = (__v2di){ CRC32_FINAL_MULT }; - const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF }; - const __v2di barrett_reduction_constants = (__v2di)CRC32_BARRETT_CONSTANTS; + const __m128i /* __v2du */ multipliers_8 = + _mm_set_epi64x(CRC32_8VECS_MULT_2, CRC32_8VECS_MULT_1); + const __m128i /* __v2du */ multipliers_4 = + _mm_set_epi64x(CRC32_4VECS_MULT_2, CRC32_4VECS_MULT_1); + const __m128i /* __v2du */ multipliers_2 = + _mm_set_epi64x(CRC32_2VECS_MULT_2, CRC32_2VECS_MULT_1); + const __m128i /* __v2du */ multipliers_1 = + _mm_set_epi64x(CRC32_1VECS_MULT_2, CRC32_1VECS_MULT_1); + const __m128i /* __v2du */ final_multiplier = + _mm_set_epi64x(0, CRC32_FINAL_MULT); + const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); + const __m128i /* __v2du */ barrett_reduction_constants = + _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, + CRC32_BARRETT_CONSTANT_1); __m128i v0, v1, v2, v3, v4, v5, v6, v7; /* @@ -135,7 +152,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) if (len < 16) return crc32_slice1(crc, p, len); - v0 = _mm_loadu_si128((const void *)p) ^ (__m128i)(__v4si){crc}; + v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); p += 16; if (len >= 64) { @@ -187,7 +205,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) const __m128i *vp; #if FOLD_PARTIAL_VECS - v0 = _mm_loadu_si128((const void *)p) ^ (__m128i)(__v4si){crc}; + v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); p += 16; /* Align p to the next 16-byte boundary. 
*/ if (align) { @@ -204,7 +223,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) len -= align; } vp = (const __m128i *)p; - v0 = *vp++ ^ (__m128i)(__v4si){crc}; + v0 = _mm_xor_si128(*vp++, _mm_cvtsi32_si128(crc)); #endif v1 = *vp++; v2 = *vp++; @@ -265,12 +284,13 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * which is equivalent to multiplying by x^32. This is needed because * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). */ - v0 = _mm_srli_si128(v0, 8) ^ - _mm_clmulepi64_si128(v0, multipliers_1, 0x10); + v0 = _mm_xor_si128(_mm_srli_si128(v0, 8), + _mm_clmulepi64_si128(v0, multipliers_1, 0x10)); /* Fold 96 => 64 bits. */ - v0 = _mm_srli_si128(v0, 4) ^ - _mm_clmulepi64_si128(v0 & mask32, final_multiplier, 0x00); + v0 = _mm_xor_si128(_mm_srli_si128(v0, 4), + _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + final_multiplier, 0x00)); /* * Reduce 64 => 32 bits using Barrett reduction. @@ -314,10 +334,15 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * R(x) = B(x) + G(x)*floor ( ------------------------- ) * \ x^32 / */ - v1 = _mm_clmulepi64_si128(v0 & mask32, barrett_reduction_constants, 0x00); - v1 = _mm_clmulepi64_si128(v1 & mask32, barrett_reduction_constants, 0x10); - crc = ((__v4si)(v0 ^ v1))[1]; -#if !FOLD_PARTIAL_VECS + v1 = _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + barrett_reduction_constants, 0x00); + v1 = _mm_clmulepi64_si128(_mm_and_si128(v1, mask32), + barrett_reduction_constants, 0x10); + v0 = _mm_xor_si128(v0, v1); +#if FOLD_PARTIAL_VECS + crc = _mm_extract_epi32(v0, 1); +#else + crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(v0, 0x01)); /* Process up to 15 bytes left over at the end. */ crc = crc32_slice1(crc, p, len); #endif diff --git a/2.0/libdeflate/lib/x86/decompress_impl.h b/2.0/libdeflate/lib/x86/decompress_impl.h index 3dc189285..3e2ec37e7 100644 --- a/2.0/libdeflate/lib/x86/decompress_impl.h +++ b/2.0/libdeflate/lib/x86/decompress_impl.h @@ -3,12 +3,17 @@ #include "cpu_features.h" -/* BMI2 optimized version */ +/* + * BMI2 optimized version + * + * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation + * enabled yet. That would require that this be moved to its own .c file. + */ #if HAVE_BMI2_INTRIN # define deflate_decompress_bmi2 deflate_decompress_bmi2 # define FUNCNAME deflate_decompress_bmi2 # if !HAVE_BMI2_NATIVE -# define ATTRIBUTES __attribute__((target("bmi2"))) +# define ATTRIBUTES _target_attribute("bmi2") # endif /* * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the @@ -20,7 +25,7 @@ */ # ifndef __clang__ # include -# ifdef __x86_64__ +# ifdef ARCH_X86_64 # define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) # define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) # else diff --git a/2.0/libdeflate/lib/x86/x86_cpu_features.c b/2.0/libdeflate/lib/x86/x86_cpu_features.c index c9dd88a53..d51f947d6 100644 --- a/2.0/libdeflate/lib/x86/x86_cpu_features.c +++ b/2.0/libdeflate/lib/x86/x86_cpu_features.c @@ -30,50 +30,60 @@ #if HAVE_DYNAMIC_X86_CPU_FEATURES -/* With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ -#if defined(__i386__) && defined(__PIC__) +/* + * With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). 
See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 + */ +#if defined(ARCH_X86_32) && defined(__PIC__) # define EBX_CONSTRAINT "=&r" #else # define EBX_CONSTRAINT "=b" #endif -/* Execute the CPUID instruction. */ +/* Execute the CPUID instruction. */ static inline void cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) { - __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" - "cpuid \n" - ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) - : "a" (leaf), "c" (subleaf)); +#ifdef _MSC_VER + int result[4]; + + __cpuidex(result, leaf, subleaf); + *a = result[0]; + *b = result[1]; + *c = result[2]; + *d = result[3]; +#else + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +#endif } -/* Read an extended control register. */ +/* Read an extended control register. */ static inline u64 read_xcr(u32 indx) { - u32 edx, eax; - - /* Execute the "xgetbv" instruction. Old versions of binutils do not - * recognize this instruction, so list the raw bytes instead. */ - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (indx)); - - return ((u64)edx << 32) | eax; +#ifdef _MSC_VER + return _xgetbv(indx); +#else + u32 d, a; + + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. + */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (indx)); + + return ((u64)d << 32) | a; +#endif } -#undef BIT -#define BIT(nr) (1UL << (nr)) - -#define XCR0_BIT_SSE BIT(1) -#define XCR0_BIT_AVX BIT(2) -#define XCR0_BIT_OPMASK BIT(5) -#define XCR0_BIT_ZMM_HI256 BIT(6) -#define XCR0_BIT_HI16_ZMM BIT(7) - -#define IS_SET(reg, nr) ((reg) & BIT(nr)) -#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask)) - static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, {X86_CPU_FEATURE_PCLMUL, "pclmul"}, @@ -87,47 +97,34 @@ volatile u32 libdeflate_x86_cpu_features = 0; /* Initialize libdeflate_x86_cpu_features. */ void libdeflate_init_x86_cpu_features(void) { + u32 max_leaf, a, b, c, d; + u64 xcr0 = 0; u32 features = 0; - u32 dummy1, dummy2, dummy3, dummy4; - u32 max_function; - u32 features_1, features_2, features_3, features_4; - bool os_avx_support = false; - - /* Get maximum supported function */ - cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); - if (max_function < 1) - goto out; - /* Standard feature flags */ - cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &b, &c, &d); + if (max_leaf < 1) + goto out; - if (IS_SET(features_1, 26)) + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + if (d & (1 << 26)) features |= X86_CPU_FEATURE_SSE2; - - if (IS_SET(features_2, 1)) + if (c & (1 << 1)) features |= X86_CPU_FEATURE_PCLMUL; - - if (IS_SET(features_2, 27)) { /* OSXSAVE set? 
*/ - u64 xcr0 = read_xcr(0); - - os_avx_support = IS_ALL_SET(xcr0, - XCR0_BIT_SSE | - XCR0_BIT_AVX); - } - - if (os_avx_support && IS_SET(features_2, 28)) + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX; - if (max_function < 7) + if (max_leaf < 7) goto out; - /* Extended feature flags */ - cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); - - if (os_avx_support && IS_SET(features_3, 5)) + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX2; - - if (IS_SET(features_3, 8)) + if (b & (1 << 8)) features |= X86_CPU_FEATURE_BMI2; out: diff --git a/2.0/libdeflate/lib/zlib_compress.c b/2.0/libdeflate/lib/zlib_compress.c index bf1dffa87..12d43602d 100644 --- a/2.0/libdeflate/lib/zlib_compress.c +++ b/2.0/libdeflate/lib/zlib_compress.c @@ -28,9 +28,7 @@ #include "deflate_compress.h" #include "zlib_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) @@ -75,7 +73,7 @@ libdeflate_zlib_compress(struct libdeflate_compressor *c, return out_next - (u8 *)out; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes) { diff --git a/2.0/libdeflate/lib/zlib_decompress.c b/2.0/libdeflate/lib/zlib_decompress.c index 6c70603f9..f5e43eaeb 100644 --- a/2.0/libdeflate/lib/zlib_decompress.c +++ b/2.0/libdeflate/lib/zlib_decompress.c @@ -28,9 +28,7 @@ #include "lib_common.h" #include "zlib_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -94,7 +92,7 @@ libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, return LIBDEFLATE_SUCCESS; } -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, diff --git a/2.0/libdeflate/libdeflate.h b/2.0/libdeflate/libdeflate.h index ffe402e2a..0a274f0f3 100644 --- a/2.0/libdeflate/libdeflate.h +++ b/2.0/libdeflate/libdeflate.h @@ -5,45 +5,29 @@ #ifndef LIBDEFLATE_H #define LIBDEFLATE_H +#include +#include + #ifdef __cplusplus extern "C" { #endif #define LIBDEFLATE_VERSION_MAJOR 1 -#define LIBDEFLATE_VERSION_MINOR 14 -#define LIBDEFLATE_VERSION_STRING "1.14" - -#include -#include +#define LIBDEFLATE_VERSION_MINOR 19 +#define LIBDEFLATE_VERSION_STRING "1.19" /* - * On Windows, you must define LIBDEFLATE_STATIC if you are linking to the - * static library version of libdeflate instead of the DLL. On other platforms, - * LIBDEFLATE_STATIC has no effect. + * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause + * __declspec(dllimport) to be used. This should be done when it's easy to do. + * Otherwise it's fine to skip it, since it is a very minor performance + * optimization that is irrelevant for most use cases of libdeflate. 
*/ -#ifdef _WIN32 -# if defined(LIBDEFLATE_STATIC) -# define LIBDEFLATEEXPORT -# elif defined(BUILDING_LIBDEFLATE) -# define LIBDEFLATEEXPORT __declspec(dllexport) +#ifndef LIBDEFLATEAPI +# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATEAPI __declspec(dllimport) # else -# define LIBDEFLATEEXPORT __declspec(dllimport) +# define LIBDEFLATEAPI # endif -#else -# define LIBDEFLATEEXPORT __attribute__((visibility("default"))) -#endif - -#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && defined(__i386__) - /* - * On i386, gcc assumes that the stack is 16-byte aligned at function entry. - * However, some compilers (e.g. MSVC) and programming languages (e.g. - * Delphi) only guarantee 4-byte alignment when calling functions. Work - * around this ABI incompatibility by realigning the stack pointer when - * entering libdeflate. This prevents crashes in SSE/AVX code. - */ -# define LIBDEFLATEAPI __attribute__((force_align_arg_pointer)) -#else -# define LIBDEFLATEAPI #endif /* ========================================================================== */ @@ -51,6 +35,7 @@ extern "C" { /* ========================================================================== */ struct libdeflate_compressor; +struct libdeflate_options; /* * libdeflate_alloc_compressor() allocates a new compressor that supports @@ -70,15 +55,22 @@ struct libdeflate_compressor; * A single compressor is not safe to use by multiple threads concurrently. * However, different threads may use different compressors concurrently. */ -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI +LIBDEFLATEAPI struct libdeflate_compressor * libdeflate_alloc_compressor(int compression_level); +/* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options); + /* * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and - * write the results to 'out', which has space for 'out_nbytes_avail' bytes. - * The return value is the compressed size in bytes, or 0 if the data could not - * be compressed to 'out_nbytes_avail' bytes or fewer (but see note below). + * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The + * return value is the compressed size in bytes, or 0 if the data could not be + * compressed to 'out_nbytes_avail' bytes or fewer. * * If compression is successful, then the output data is guaranteed to be a * valid DEFLATE stream that decompresses to the input data. No other @@ -88,24 +80,8 @@ libdeflate_alloc_compressor(int compression_level); * writing tests that compare compressed data to a golden output, as this can * break when libdeflate is updated. (This property isn't specific to * libdeflate; the same is true for zlib and other compression libraries too.) - * - * Note: due to a performance optimization, libdeflate_deflate_compress() - * currently needs a small amount of slack space at the end of the output - * buffer. As a result, it can't actually report compressed sizes very close to - * 'out_nbytes_avail'. This doesn't matter in real-world use cases, and - * libdeflate_deflate_compress_bound() already includes the slack space. 
- * However, it does mean that testing code that redundantly compresses data - * using an exact-sized output buffer won't work as might be expected: - * - * out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, - * libdeflate_deflate_compress_bound(in_nbytes)); - * // The following assertion will fail. - * assert(libdeflate_deflate_compress(c, in, in_nbytes, out, out_nbytes) != 0); - * - * To avoid this, either don't write tests like the above, or make sure to - * include at least 9 bytes of slack space in 'out_nbytes_avail'. */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *compressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail); @@ -114,11 +90,10 @@ libdeflate_deflate_compress(struct libdeflate_compressor *compressor, * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the * number of bytes of compressed data that may be produced by compressing any * buffer of length less than or equal to 'in_nbytes' using - * libdeflate_deflate_compress() with the specified compressor. Mathematically, - * this bound will necessarily be a number greater than or equal to 'in_nbytes'. - * It may be an overestimate of the true upper bound. The return value is - * guaranteed to be the same for all invocations with the same compressor and - * same 'in_nbytes'. + * libdeflate_deflate_compress() with the specified compressor. This bound will + * necessarily be a number greater than or equal to 'in_nbytes'. It may be an + * overestimate of the true upper bound. The return value is guaranteed to be + * the same for all invocations with the same compressor and same 'in_nbytes'. * * As a special case, 'compressor' may be NULL. This causes the bound to be * taken across *any* libdeflate_compressor that could ever be allocated with @@ -135,15 +110,15 @@ libdeflate_deflate_compress(struct libdeflate_compressor *compressor, * libdeflate_deflate_compress() returns 0, indicating that the compressed data * did not fit into the provided output buffer. */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, size_t in_nbytes); /* - * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper - * format. + * Like libdeflate_deflate_compress(), but uses the zlib wrapper format instead + * of raw DEFLATE. */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress(struct libdeflate_compressor *compressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail); @@ -153,15 +128,15 @@ libdeflate_zlib_compress(struct libdeflate_compressor *compressor, * compressed with libdeflate_zlib_compress() rather than with * libdeflate_deflate_compress(). */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, size_t in_nbytes); /* - * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper - * format. + * Like libdeflate_deflate_compress(), but uses the gzip wrapper format instead + * of raw DEFLATE. 
*/ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress(struct libdeflate_compressor *compressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail); @@ -171,7 +146,7 @@ libdeflate_gzip_compress(struct libdeflate_compressor *compressor, * compressed with libdeflate_gzip_compress() rather than with * libdeflate_deflate_compress(). */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, size_t in_nbytes); @@ -180,7 +155,7 @@ libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is * taken. */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *compressor); /* ========================================================================== */ @@ -188,6 +163,7 @@ libdeflate_free_compressor(struct libdeflate_compressor *compressor); /* ========================================================================== */ struct libdeflate_decompressor; +struct libdeflate_options; /* * libdeflate_alloc_decompressor() allocates a new decompressor that can be used @@ -201,9 +177,15 @@ struct libdeflate_decompressor; * A single decompressor is not safe to use by multiple threads concurrently. * However, different threads may use different decompressors concurrently. */ -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI +LIBDEFLATEAPI struct libdeflate_decompressor * libdeflate_alloc_decompressor(void); +/* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + /* * Result of a call to libdeflate_deflate_decompress(), * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). @@ -212,8 +194,8 @@ enum libdeflate_result { /* Decompression was successful. */ LIBDEFLATE_SUCCESS = 0, - /* Decompressed failed because the compressed data was invalid, corrupt, - * or otherwise unsupported. */ + /* Decompression failed because the compressed data was invalid, + * corrupt, or otherwise unsupported. */ LIBDEFLATE_BAD_DATA = 1, /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have @@ -226,13 +208,12 @@ enum libdeflate_result { }; /* - * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream - * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The - * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' - * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. - * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If - * a nonzero result code is returned, then the contents of the output buffer are - * undefined. + * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer + * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If + * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise, + * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the + * contents of the output buffer are undefined. * * Decompression stops at the end of the DEFLATE stream (as indicated by the * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. 
@@ -257,7 +238,7 @@ enum libdeflate_result { * not large enough but no other problems were encountered, or another * nonzero result code if decompression failed for another reason. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -269,7 +250,7 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, * then the actual compressed size of the DEFLATE stream (aligned to the next * byte boundary) is written to *actual_in_nbytes_ret. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -284,7 +265,7 @@ libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, * use libdeflate_zlib_decompress_ex(). */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -297,7 +278,7 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, * buffer was decompressed), then the actual number of input bytes consumed is * written to *actual_in_nbytes_ret. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -312,7 +293,7 @@ libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need * multi-member support. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -325,7 +306,7 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, * buffer was decompressed), then the actual number of input bytes consumed is * written to *actual_in_nbytes_ret. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -337,7 +318,7 @@ libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action * is taken. */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); /* ========================================================================== */ @@ -350,7 +331,7 @@ libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); * required initial value for 'adler' is 1. This value is also returned when * 'buffer' is specified as NULL. */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI +LIBDEFLATEAPI uint32_t libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); @@ -360,7 +341,7 @@ libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); * initial value for 'crc' is 0. 
This value is also returned when 'buffer' is * specified as NULL. */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI +LIBDEFLATEAPI uint32_t libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); /* ========================================================================== */ @@ -369,16 +350,60 @@ libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); /* * Install a custom memory allocator which libdeflate will use for all memory - * allocations. 'malloc_func' is a function that must behave like malloc(), and - * 'free_func' is a function that must behave like free(). + * allocations by default. 'malloc_func' is a function that must behave like + * malloc(), and 'free_func' is a function that must behave like free(). + * + * The per-(de)compressor custom memory allocator that can be specified in + * 'struct libdeflate_options' takes priority over this. * - * There must not be any libdeflate_compressor or libdeflate_decompressor - * structures in existence when calling this function. + * This doesn't affect the free() function that will be used to free + * (de)compressors that were already in existence when this is called. */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), void (*free_func)(void *)); +/* + * Advanced options. This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. + */ +struct libdeflate_options { + + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; + + /* + * An optional custom memory allocator to use for this (de)compressor. + * 'malloc_func' must be a function that behaves like malloc(), and + * 'free_func' must be a function that behaves like free(). + * + * This is useful in cases where a process might have multiple users of + * libdeflate who want to use different memory allocators. For example, + * a library might want to use libdeflate with a custom memory allocator + * without interfering with user code that might use libdeflate too. + * + * This takes priority over the "global" memory allocator (which by + * default is malloc() and free(), but can be changed by + * libdeflate_set_memory_allocator()). Moreover, libdeflate will never + * call the "global" memory allocator if a per-(de)compressor custom + * allocator is always given. + */ + void *(*malloc_func)(size_t); + void (*free_func)(void *); +}; + #ifdef __cplusplus } #endif diff --git a/2.0/plink2.cc b/2.0/plink2.cc index 72bdec56e..3f771c9a2 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -72,7 +72,7 @@ static const char ver_str[] = "PLINK v2.00a6" #elif defined(USE_AOCL) " AMD" #endif - " (27 Sep 2023)"; + " (28 Sep 2023)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same ""