From d67c3aeca643a1527ac1924f0707701f8706dc6c Mon Sep 17 00:00:00 2001 From: Christopher Chang Date: Thu, 28 Sep 2023 11:33:18 -0700 Subject: [PATCH] libdeflate 1.19, --clump test --- 2.0/Tests/TEST_PHASED_VCF/glm_compare.py | 27 +- 2.0/Tests/TEST_PHASED_VCF/run_tests.sh | 24 +- 2.0/libdeflate/common_defs.h | 156 +++- 2.0/libdeflate/lib/adler32.c | 7 +- 2.0/libdeflate/lib/arm/adler32_impl.h | 132 +-- 2.0/libdeflate/lib/arm/arm_cpu_features.c | 28 +- 2.0/libdeflate/lib/arm/cpu_features.h | 155 ++-- 2.0/libdeflate/lib/arm/crc32_impl.h | 142 +-- 2.0/libdeflate/lib/arm/crc32_pmull_helpers.h | 58 +- 2.0/libdeflate/lib/arm/crc32_pmull_wide.h | 34 +- 2.0/libdeflate/lib/arm/matchfinder_impl.h | 13 +- 2.0/libdeflate/lib/bt_matchfinder.h | 5 +- 2.0/libdeflate/lib/cpu_features_common.h | 6 +- 2.0/libdeflate/lib/crc32.c | 7 +- 2.0/libdeflate/lib/crc32_table.h | 526 ------------ 2.0/libdeflate/lib/crc32_vec_template.h | 61 -- 2.0/libdeflate/lib/decompress_template.h | 13 +- 2.0/libdeflate/lib/deflate_compress.c | 806 ++++++++++++------ 2.0/libdeflate/lib/deflate_decompress.c | 114 ++- 2.0/libdeflate/lib/gzip_compress.c | 6 +- 2.0/libdeflate/lib/gzip_decompress.c | 6 +- 2.0/libdeflate/lib/hc_matchfinder.h | 5 +- 2.0/libdeflate/lib/ht_matchfinder.h | 4 +- 2.0/libdeflate/lib/lib_common.h | 41 +- 2.0/libdeflate/lib/matchfinder_common.h | 4 +- 2.0/libdeflate/lib/unaligned.h | 228 ----- 2.0/libdeflate/lib/utils.c | 38 +- 2.0/libdeflate/lib/x86/adler32_impl.h | 171 ++-- 2.0/libdeflate/lib/x86/cpu_features.h | 107 ++- 2.0/libdeflate/lib/x86/crc32_impl.h | 11 +- .../lib/x86/crc32_pclmul_template.h | 69 +- 2.0/libdeflate/lib/x86/decompress_impl.h | 11 +- 2.0/libdeflate/lib/x86/x86_cpu_features.c | 117 ++- 2.0/libdeflate/lib/zlib_compress.c | 6 +- 2.0/libdeflate/lib/zlib_decompress.c | 6 +- 2.0/libdeflate/libdeflate.h | 203 +++-- 2.0/plink2.cc | 2 +- 37 files changed, 1583 insertions(+), 1766 deletions(-) delete mode 100644 2.0/libdeflate/lib/crc32_table.h delete mode 100644 2.0/libdeflate/lib/crc32_vec_template.h delete mode 100644 2.0/libdeflate/lib/unaligned.h diff --git a/2.0/Tests/TEST_PHASED_VCF/glm_compare.py b/2.0/Tests/TEST_PHASED_VCF/glm_compare.py index 4c9a32e5a..84ce1857d 100755 --- a/2.0/Tests/TEST_PHASED_VCF/glm_compare.py +++ b/2.0/Tests/TEST_PHASED_VCF/glm_compare.py @@ -61,6 +61,7 @@ def main(): a1_col2 = first_line2.index('A1') test_col2 = first_line2.index('TEST') obsct_col2 = first_line2.index('OBS_CT') + errcode_col2 = first_line2.index('ERRCODE') betaor_col2 = -1 stat_col2 = -1 is_odds_ratio = False @@ -102,16 +103,24 @@ def main(): row1[5] != row2[obsct_col2]: eprint('Header column mismatch between association files.') sys.exit(1) - if row1[6] != 'NA' and row2[betaor_col2] != 'NA': - val1 = float(row1[6]) - val2 = float(row2[betaor_col2]) - if is_odds_ratio: - # this is a more appropriate scale - val1 = math.log(val1) - val2 = math.log(val2) - if not float_compare_ok(val1, val2, tol): - eprint('BETA/OR mismatch.') + if row1[6] != 'NA': + # Apple clang 15.0.0 apparent-miscompilation issue is likely to + # manifest as an inappropriate ERRCODE=INVALID_RESULT. This is + # very unlikely to happen on random data, so error out and then + # manually pass if necessary. 
+ if row2[errcode_col2] == 'INVALID_RESULT': + eprint('Unexpected INVALID_RESULT.') sys.exit(1) + if row2[betaor_col2] != 'NA': + val1 = float(row1[6]) + val2 = float(row2[betaor_col2]) + if is_odds_ratio: + # this is a more appropriate scale + val1 = math.log(val1) + val2 = math.log(val2) + if not float_compare_ok(val1, val2, tol): + eprint('BETA/OR mismatch.') + sys.exit(1) if row1[7] != 'NA' and row1[7] != 'inf' and row2[stat_col2] != 'NA': val1 = float(row1[7]) val2 = float(row2[stat_col2]) diff --git a/2.0/Tests/TEST_PHASED_VCF/run_tests.sh b/2.0/Tests/TEST_PHASED_VCF/run_tests.sh index 0d69657f0..ca03ad72b 100755 --- a/2.0/Tests/TEST_PHASED_VCF/run_tests.sh +++ b/2.0/Tests/TEST_PHASED_VCF/run_tests.sh @@ -91,7 +91,7 @@ diff -q plink2_pca.eigenvec plink2_pca_rfreq.eigenvec $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pca 3 approx biallelic-var-wts --out plink2_pca_approx python3 pca_compare.py -1 plink1_pca -2 plink2_pca_approx -t 0.009 -# Test --glm. +# Test --glm and --clump. # Generate random binary and quantitative phenotypes for 1kg_phase3_chr21 # samples, and verify regression results are consistent with plink 1.9 within a # tolerance (tbd: see if we can generate new phenotypes each time, or if it's @@ -120,9 +120,23 @@ $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm allow- python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm.PHENO1.glm.logistic -t 0.1 $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm allow-no-covars no-firth --out plink2_glm_dbl python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm_dbl.PHENO1.glm.logistic -t 0.3 +plink --bfile plink1_data --clump plink2_glm_dbl.PHENO1.glm.logistic --clump-snp-field ID --clump-p1 0.1 --clump-p2 0.2 --out plink1_test +# Extract SNP, TOTAL, NSIG, S05, S01, S001, S0001, and SP2 columns. +# ($12 != "") check needed because plink 1.x puts two blank lines at the end of +# the .clumped file. 
+cat plink1_test.clumped | tail -n +2 | awk '{if ($12 == "NONE") $12 = "."; if ($12 != "") print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink1_test.clump_compare +plink2 --bfile plink1_data --clump cols=+f plink2_glm_dbl.PHENO1.glm.logistic --clump-p1 0.1 --clump-p2 0.2 --out plink2_test +cat plink2_test.clumps | tail -n +2 | awk '{print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink2_test.clump_compare +diff -q plink1_test.clump_compare plink2_test.clump_compare + plink --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --linear --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm allow-no-covars --out plink2_glm python3 glm_compare.py -1 plink1_glm.assoc.linear -2 plink2_glm.PHENO1.glm.linear -t 0.1 +plink --bfile plink1_data --clump plink2_glm.PHENO1.glm.linear --clump-snp-field ID --clump-p1 0.1 --clump-p2 0.2 --out plink1_test +cat plink1_test.clumped | tail -n +2 | awk '{if ($12 == "NONE") $12 = "."; if ($12 != "") print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink1_test.clump_compare +plink2 --bfile plink1_data --clump cols=+f plink2_glm.PHENO1.glm.linear --clump-p1 0.1 --clump-p2 0.2 --out plink2_test +cat plink2_test.clumps | tail -n +2 | awk '{print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink2_test.clump_compare +diff -q plink1_test.clump_compare plink2_test.clump_compare plink --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --logistic genotypic --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm allow-no-covars no-firth single-prec-cc genotypic --out plink2_glm @@ -132,6 +146,7 @@ python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm_dbl.PHENO1.glm plink --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --linear genotypic --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm allow-no-covars genotypic --out plink2_glm python3 glm_compare.py -1 plink1_glm.assoc.linear -2 plink2_glm.PHENO1.glm.linear -t 0.1 +# don't test --clump here since plink 1.x ignores TEST column plink --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --logistic --covar plink1_pca.eigenvec --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm no-firth single-prec-cc --covar plink2_pca.eigenvec --out plink2_glm @@ -141,6 +156,13 @@ python3 glm_compare.py -1 plink1_glm.assoc.logistic -2 plink2_glm_dbl.PHENO1.glm plink --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --linear --covar plink1_pca.eigenvec --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm --covar plink2_pca.eigenvec --out plink2_glm python3 glm_compare.py -1 plink1_glm.assoc.linear -2 plink2_glm.PHENO1.glm.linear -t 0.1 +# hide-covar allows --clump test here +$1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_qt.txt --glm hide-covar --covar plink2_pca.eigenvec --out plink2_glm +plink --bfile plink1_data --clump plink2_glm.PHENO1.glm.linear --clump-snp-field ID --clump-p1 0.1 --clump-p2 0.2 --out plink1_test +cat plink1_test.clumped | tail -n +2 | awk '{if ($12 == "NONE") $12 = "."; if ($12 != "") print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > plink1_test.clump_compare +plink2 --bfile plink1_data --clump cols=+f plink2_glm.PHENO1.glm.linear --clump-p1 0.1 --clump-p2 0.2 --out plink2_test +cat plink2_test.clumps | tail -n +2 | awk '{print $3"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12}' > 
plink2_test.clump_compare +diff -q plink1_test.clump_compare plink2_test.clump_compare plink --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --logistic genotypic --covar plink1_pca.eigenvec --allow-no-sex --out plink1_glm $1/plink2 $2 $3 --bfile plink1_data --maf 0.02 --pheno pheno_cc.txt --glm no-firth single-prec-cc genotypic --covar plink2_pca.eigenvec --out plink2_glm diff --git a/2.0/libdeflate/common_defs.h b/2.0/libdeflate/common_defs.h index cfe6fd62b..e1bc3fe09 100644 --- a/2.0/libdeflate/common_defs.h +++ b/2.0/libdeflate/common_defs.h @@ -28,16 +28,65 @@ #ifndef COMMON_DEFS_H #define COMMON_DEFS_H +#include "libdeflate.h" + #include #include /* for size_t */ #include #ifdef _MSC_VER +# include /* for _BitScan*() and other intrinsics */ # include /* for _byteswap_*() */ + /* Disable MSVC warnings that are expected. */ + /* /W2 */ +# pragma warning(disable : 4146) /* unary minus on unsigned type */ + /* /W3 */ +# pragma warning(disable : 4018) /* signed/unsigned mismatch */ +# pragma warning(disable : 4244) /* possible loss of data */ +# pragma warning(disable : 4267) /* possible loss of precision */ +# pragma warning(disable : 4310) /* cast truncates constant value */ + /* /W4 */ +# pragma warning(disable : 4100) /* unreferenced formal parameter */ +# pragma warning(disable : 4127) /* conditional expression is constant */ +# pragma warning(disable : 4189) /* local variable initialized but not referenced */ +# pragma warning(disable : 4232) /* nonstandard extension used */ +# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */ +# pragma warning(disable : 4295) /* array too small to include terminating null */ #endif #ifndef FREESTANDING # include /* for memcpy() */ #endif +/* ========================================================================== */ +/* Target architecture */ +/* ========================================================================== */ + +/* If possible, define a compiler-independent ARCH_* macro. 
*/ +#undef ARCH_X86_64 +#undef ARCH_X86_32 +#undef ARCH_ARM64 +#undef ARCH_ARM32 +#ifdef _MSC_VER +# if defined(_M_X64) +# define ARCH_X86_64 +# elif defined(_M_IX86) +# define ARCH_X86_32 +# elif defined(_M_ARM64) +# define ARCH_ARM64 +# elif defined(_M_ARM) +# define ARCH_ARM32 +# endif +#else +# if defined(__x86_64__) +# define ARCH_X86_64 +# elif defined(__i386__) +# define ARCH_X86_32 +# elif defined(__aarch64__) +# define ARCH_ARM64 +# elif defined(__arm__) +# define ARCH_ARM32 +# endif +#endif + /* ========================================================================== */ /* Type definitions */ /* ========================================================================== */ @@ -111,22 +160,13 @@ typedef size_t machine_word_t; # define __has_builtin(builtin) 0 #endif -/* LIBEXPORT - export a function from a shared library */ -#ifdef _WIN32 -# define LIBEXPORT __declspec(dllexport) -#elif defined(__GNUC__) -# define LIBEXPORT __attribute__((visibility("default"))) -#else -# define LIBEXPORT -#endif - /* inline - suggest that a function be inlined */ #ifdef _MSC_VER # define inline __inline #endif /* else assume 'inline' is usable as-is */ /* forceinline - force a function to be inlined, if possible */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_attribute(always_inline) # define forceinline inline __attribute__((always_inline)) #elif defined(_MSC_VER) # define forceinline __forceinline @@ -135,54 +175,71 @@ typedef size_t machine_word_t; #endif /* MAYBE_UNUSED - mark a function or variable as maybe unused */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_attribute(unused) # define MAYBE_UNUSED __attribute__((unused)) #else # define MAYBE_UNUSED #endif -/* restrict - hint that writes only occur through the given pointer */ -#ifdef __GNUC__ -# define restrict __restrict__ -#elif defined(_MSC_VER) - /* - * Don't use MSVC's __restrict; it has nonstandard behavior. - * Standard restrict is okay, if it is supported. - */ -# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) -# define restrict restrict +/* + * restrict - hint that writes only occur through the given pointer. + * + * Don't use MSVC's __restrict, since it has nonstandard behavior. + * Standard restrict is okay, if it is supported. 
+ */ +#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L) +# if defined(__GNUC__) || defined(__clang__) +# define restrict __restrict__ # else # define restrict # endif -#else -# define restrict -#endif +#endif /* else assume 'restrict' is usable as-is */ /* likely(expr) - hint that an expression is usually true */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) # define likely(expr) __builtin_expect(!!(expr), 1) #else # define likely(expr) (expr) #endif /* unlikely(expr) - hint that an expression is usually false */ -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) # define unlikely(expr) __builtin_expect(!!(expr), 0) #else # define unlikely(expr) (expr) #endif /* prefetchr(addr) - prefetch into L1 cache for read */ -#ifdef __GNUC__ +#undef prefetchr +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) # define prefetchr(addr) __builtin_prefetch((addr), 0) -#else +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0) +# elif defined(ARCH_ARM64) +# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchr(addr) __prefetch(addr) +# endif +#endif +#ifndef prefetchr # define prefetchr(addr) #endif /* prefetchw(addr) - prefetch into L1 cache for write */ -#ifdef __GNUC__ +#undef prefetchw +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) # define prefetchw(addr) __builtin_prefetch((addr), 1) -#else +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchw(addr) _m_prefetchw(addr) +# elif defined(ARCH_ARM64) +# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchw(addr) __prefetchw(addr) +# endif +#endif +#ifndef prefetchw # define prefetchw(addr) #endif @@ -191,13 +248,28 @@ typedef size_t machine_word_t; * the annotated type, must be aligned on n-byte boundaries. */ #undef _aligned_attribute -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_attribute(aligned) # define _aligned_attribute(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _aligned_attribute(n) __declspec(align(n)) #endif -/* Does the compiler support the 'target' function attribute? */ -#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ - (GCC_PREREQ(4, 4) || __has_attribute(target)) +/* + * _target_attribute(attrs) - override the compilation target for a function. + * + * This accepts one or more comma-separated suffixes to the -m prefix jointly + * forming the name of a machine-dependent option. On gcc-like compilers, this + * enables codegen for the given targets, including arbitrary compiler-generated + * code as well as the corresponding intrinsics. On other compilers this macro + * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. + */ +#if GCC_PREREQ(4, 4) || __has_attribute(target) +# define _target_attribute(attrs) __attribute__((target(attrs))) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 +#else +# define _target_attribute(attrs) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif /* ========================================================================== */ /* Miscellaneous macros */ @@ -299,8 +371,8 @@ static forceinline u64 bswap64(u64 v) * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed * efficiently on the target platform, otherwise 0. 
*/ -#if defined(__GNUC__) && \ - (defined(__x86_64__) || defined(__i386__) || \ +#if (defined(__GNUC__) || defined(__clang__)) && \ + (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ /* * For all compilation purposes, WebAssembly behaves like any other CPU @@ -520,7 +592,7 @@ put_unaligned_leword(machine_word_t v, u8 *p) static forceinline unsigned bsr32(u32 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_clz) return 31 - __builtin_clz(v); #elif defined(_MSC_VER) unsigned long i; @@ -539,7 +611,7 @@ bsr32(u32 v) static forceinline unsigned bsr64(u64 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) return 63 - __builtin_clzll(v); #elif defined(_MSC_VER) && defined(_WIN64) unsigned long i; @@ -574,7 +646,7 @@ bsrw(machine_word_t v) static forceinline unsigned bsf32(u32 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_ctz) return __builtin_ctz(v); #elif defined(_MSC_VER) unsigned long i; @@ -593,7 +665,7 @@ bsf32(u32 v) static forceinline unsigned bsf64(u64 v) { -#ifdef __GNUC__ +#if defined(__GNUC__) || __has_builtin(__builtin_ctzll) return __builtin_ctzll(v); #elif defined(_MSC_VER) && defined(_WIN64) unsigned long i; @@ -624,7 +696,7 @@ bsfw(machine_word_t v) * fallback implementation; use '#ifdef rbit32' to check if this is available. */ #undef rbit32 -#if defined(__GNUC__) && defined(__arm__) && \ +#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \ (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) static forceinline u32 rbit32(u32 v) @@ -633,7 +705,7 @@ rbit32(u32 v) return v; } #define rbit32 rbit32 -#elif defined(__GNUC__) && defined(__aarch64__) +#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64) static forceinline u32 rbit32(u32 v) { diff --git a/2.0/libdeflate/lib/adler32.c b/2.0/libdeflate/lib/adler32.c index a7f4a3391..3a526ac9d 100644 --- a/2.0/libdeflate/lib/adler32.c +++ b/2.0/libdeflate/lib/adler32.c @@ -26,7 +26,6 @@ */ #include "lib_common.h" -#include "libdeflate.h" /* The Adler-32 divisor, or "base", value */ #define DIVISOR 65521 @@ -91,9 +90,9 @@ adler32_generic(u32 adler, const u8 *p, size_t len) #undef DEFAULT_IMPL #undef arch_select_adler32_func typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); -#if defined(__arm__) || defined(__aarch64__) +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/adler32_impl.h" -#elif defined(__i386__) || defined(__x86_64__) +#elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/adler32_impl.h" #endif @@ -122,7 +121,7 @@ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) #define adler32_impl DEFAULT_IMPL #endif -LIBDEFLATEEXPORT u32 LIBDEFLATEAPI +LIBDEFLATEAPI u32 libdeflate_adler32(u32 adler, const void *buffer, size_t len) { if (buffer == NULL) /* Return initial value. 
*/ diff --git a/2.0/libdeflate/lib/arm/adler32_impl.h b/2.0/libdeflate/lib/arm/adler32_impl.h index e0589fe14..4083b2ef3 100644 --- a/2.0/libdeflate/lib/arm/adler32_impl.h +++ b/2.0/libdeflate/lib/arm/adler32_impl.h @@ -42,10 +42,10 @@ # if HAVE_NEON_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("fpu=neon"))) +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=neon") # else -# define ATTRIBUTES __attribute__((target("+simd"))) +# define ATTRIBUTES _target_attribute("+simd") # endif # endif # include @@ -53,29 +53,35 @@ static forceinline ATTRIBUTES void adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, u32 *s1, u32 *s2) { - const uint16x8_t mults_a = { 64, 63, 62, 61, 60, 59, 58, 57, }; - const uint16x8_t mults_b = { 56, 55, 54, 53, 52, 51, 50, 49, }; - const uint16x8_t mults_c = { 48, 47, 46, 45, 44, 43, 42, 41, }; - const uint16x8_t mults_d = { 40, 39, 38, 37, 36, 35, 34, 33, }; - const uint16x8_t mults_e = { 32, 31, 30, 29, 28, 27, 26, 25, }; - const uint16x8_t mults_f = { 24, 23, 22, 21, 20, 19, 18, 17, }; - const uint16x8_t mults_g = { 16, 15, 14, 13, 12, 11, 10, 9, }; - const uint16x8_t mults_h = { 8, 7, 6, 5, 4, 3, 2, 1, }; + static const u16 _aligned_attribute(16) mults[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const uint16x8_t mults_a = vld1q_u16(&mults[0]); + const uint16x8_t mults_b = vld1q_u16(&mults[8]); + const uint16x8_t mults_c = vld1q_u16(&mults[16]); + const uint16x8_t mults_d = vld1q_u16(&mults[24]); + const uint16x8_t mults_e = vld1q_u16(&mults[32]); + const uint16x8_t mults_f = vld1q_u16(&mults[40]); + const uint16x8_t mults_g = vld1q_u16(&mults[48]); + const uint16x8_t mults_h = vld1q_u16(&mults[56]); - uint32x4_t v_s1 = { 0, 0, 0, 0 }; - uint32x4_t v_s2 = { 0, 0, 0, 0 }; + uint32x4_t v_s1 = vdupq_n_u32(0); + uint32x4_t v_s2 = vdupq_n_u32(0); /* * v_byte_sums_* contain the sum of the bytes at index i across all * 64-byte segments, for each index 0..63. */ - uint16x8_t v_byte_sums_a = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_b = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_c = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_d = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_e = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_f = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_g = { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_h = { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint16x8_t v_byte_sums_a = vdupq_n_u16(0); + uint16x8_t v_byte_sums_b = vdupq_n_u16(0); + uint16x8_t v_byte_sums_c = vdupq_n_u16(0); + uint16x8_t v_byte_sums_d = vdupq_n_u16(0); + uint16x8_t v_byte_sums_e = vdupq_n_u16(0); + uint16x8_t v_byte_sums_f = vdupq_n_u16(0); + uint16x8_t v_byte_sums_g = vdupq_n_u16(0); + uint16x8_t v_byte_sums_h = vdupq_n_u16(0); do { /* Load the next 64 bytes. */ @@ -89,7 +95,7 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, * Accumulate the previous s1 counters into the s2 counters. * The needed multiplication by 64 is delayed to later. */ - v_s2 += v_s1; + v_s2 = vaddq_u32(v_s2, v_s1); /* * Add the 64 bytes to their corresponding v_byte_sums counters, @@ -113,7 +119,7 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, } while (p != end); /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... 
+ 1*bytesum63) */ -#ifdef __arm__ +#ifdef ARCH_ARM32 # define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) #else # define umlal2 vmlal_high_u16 @@ -138,8 +144,15 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, #undef umlal2 /* Horizontal sum to finish up */ - *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3]; - *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3]; +#ifdef ARCH_ARM32 + *s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + + vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); + *s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + + vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); +#else + *s1 += vaddvq_u32(v_s1); + *s2 += vaddvq_u32(v_s2); +#endif } # include "../adler32_vec_template.h" #endif /* Regular NEON implementation */ @@ -156,16 +169,16 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, # define ATTRIBUTES # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("dotprod"))) +# define ATTRIBUTES _target_attribute("dotprod") /* * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the * default target is armv8.3-a or later in which case it must be omitted. * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. */ # elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES __attribute__((target("+dotprod"))) +# define ATTRIBUTES _target_attribute("+dotprod") # else -# define ATTRIBUTES __attribute__((target("arch=armv8.2-a+dotprod"))) +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") # endif # endif # include @@ -173,35 +186,32 @@ static forceinline ATTRIBUTES void adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, u32 *s1, u32 *s2) { - const uint8x16_t mults_a = { + static const u8 _aligned_attribute(16) mults[64] = { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - }; - const uint8x16_t mults_b = { 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - }; - const uint8x16_t mults_c = { 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - }; - const uint8x16_t mults_d = { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, }; - const uint8x16_t ones = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, - }; - uint32x4_t v_s1_a = { 0, 0, 0, 0 }; - uint32x4_t v_s1_b = { 0, 0, 0, 0 }; - uint32x4_t v_s1_c = { 0, 0, 0, 0 }; - uint32x4_t v_s1_d = { 0, 0, 0, 0 }; - uint32x4_t v_s2_a = { 0, 0, 0, 0 }; - uint32x4_t v_s2_b = { 0, 0, 0, 0 }; - uint32x4_t v_s2_c = { 0, 0, 0, 0 }; - uint32x4_t v_s2_d = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_a = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_b = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_c = { 0, 0, 0, 0 }; - uint32x4_t v_s1_sums_d = { 0, 0, 0, 0 }; + const uint8x16_t mults_a = vld1q_u8(&mults[0]); + const uint8x16_t mults_b = vld1q_u8(&mults[16]); + const uint8x16_t mults_c = vld1q_u8(&mults[32]); + const uint8x16_t mults_d = vld1q_u8(&mults[48]); + const uint8x16_t ones = vdupq_n_u8(1); + uint32x4_t v_s1_a = vdupq_n_u32(0); + uint32x4_t v_s1_b = vdupq_n_u32(0); + uint32x4_t v_s1_c = vdupq_n_u32(0); + uint32x4_t v_s1_d = vdupq_n_u32(0); + uint32x4_t v_s2_a = vdupq_n_u32(0); + uint32x4_t v_s2_b = vdupq_n_u32(0); + uint32x4_t v_s2_c = vdupq_n_u32(0); + uint32x4_t v_s2_d = vdupq_n_u32(0); + uint32x4_t v_s1_sums_a = vdupq_n_u32(0); + uint32x4_t v_s1_sums_b = vdupq_n_u32(0); + uint32x4_t v_s1_sums_c = vdupq_n_u32(0); + uint32x4_t v_s1_sums_d = vdupq_n_u32(0); uint32x4_t v_s1; uint32x4_t v_s2; + uint32x4_t v_s1_sums; do { uint8x16_t bytes_a = *p++; @@ 
-209,29 +219,31 @@ adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, uint8x16_t bytes_c = *p++; uint8x16_t bytes_d = *p++; - v_s1_sums_a += v_s1_a; + v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones); v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a); - v_s1_sums_b += v_s1_b; + v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones); v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b); - v_s1_sums_c += v_s1_c; + v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones); v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c); - v_s1_sums_d += v_s1_d; + v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones); v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d); } while (p != end); - v_s1 = v_s1_a + v_s1_b + v_s1_c + v_s1_d; - v_s2 = v_s2_a + v_s2_b + v_s2_c + v_s2_d + - vqshlq_n_u32(v_s1_sums_a + v_s1_sums_b + - v_s1_sums_c + v_s1_sums_d, 6); - *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3]; - *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3]; + v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), vaddq_u32(v_s1_c, v_s1_d)); + v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), vaddq_u32(v_s2_c, v_s2_d)); + v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, v_s1_sums_b), + vaddq_u32(v_s1_sums_c, v_s1_sums_d)); + v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); + + *s1 += vaddvq_u32(v_s1); + *s2 += vaddvq_u32(v_s2); } # include "../adler32_vec_template.h" #endif /* NEON+dotprod implementation */ diff --git a/2.0/libdeflate/lib/arm/arm_cpu_features.c b/2.0/libdeflate/lib/arm/arm_cpu_features.c index 98f881dec..72ab03da3 100644 --- a/2.0/libdeflate/lib/arm/arm_cpu_features.c +++ b/2.0/libdeflate/lib/arm/arm_cpu_features.c @@ -31,8 +31,9 @@ */ #ifdef __APPLE__ -#undef _ANSI_SOURCE -#define _DARWIN_C_SOURCE /* for sysctlbyname() */ +# undef _ANSI_SOURCE +# undef _DARWIN_C_SOURCE +# define _DARWIN_C_SOURCE /* for sysctlbyname() */ #endif #include "../cpu_features_common.h" /* must be included first */ @@ -79,7 +80,7 @@ static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) goto out; } filled += ret; - } while (filled < (long)(2 * sizeof(long))); + } while (filled < 2 * sizeof(long)); i = 0; do { @@ -92,7 +93,7 @@ static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) *hwcap2 = value; i += 2; filled -= 2 * sizeof(long); - } while (filled >= (long)(2 * sizeof(long))); + } while (filled >= 2 * sizeof(long)); memmove(auxbuf, &auxbuf[i], filled); } @@ -108,7 +109,7 @@ static u32 query_arm_cpu_features(void) scan_auxv(&hwcap, &hwcap2); -#ifdef __arm__ +#ifdef ARCH_ARM32 STATIC_ASSERT(sizeof(long) == 4); if (hwcap & (1 << 12)) /* HWCAP_NEON */ features |= ARM_CPU_FEATURE_NEON; @@ -167,6 +168,23 @@ static u32 query_arm_cpu_features(void) } return features; } +#elif defined(_WIN32) + +#include + +static u32 query_arm_cpu_features(void) +{ + u32 features = ARM_CPU_FEATURE_NEON; + + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_PMULL; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_CRC32; + + /* FIXME: detect SHA3 and DOTPROD support too. 
*/ + + return features; +} #else #error "unhandled case" #endif diff --git a/2.0/libdeflate/lib/arm/cpu_features.h b/2.0/libdeflate/lib/arm/cpu_features.h index c6dfe86d3..c55f007cf 100644 --- a/2.0/libdeflate/lib/arm/cpu_features.h +++ b/2.0/libdeflate/lib/arm/cpu_features.h @@ -32,12 +32,13 @@ #define HAVE_DYNAMIC_ARM_CPU_FEATURES 0 -#if defined(__arm__) || defined(__aarch64__) +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ - !defined(FREESTANDING) && \ - (defined(__linux__) || \ - (defined(__aarch64__) && defined(__APPLE__))) +#if !defined(FREESTANDING) && \ + (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \ + (defined(__linux__) || \ + (defined(__APPLE__) && defined(ARCH_ARM64)) || \ + (defined(_WIN32) && defined(ARCH_ARM64))) # undef HAVE_DYNAMIC_ARM_CPU_FEATURES # define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 #endif @@ -71,19 +72,18 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ /* NEON */ -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || defined(ARCH_ARM64) # define HAVE_NEON_NATIVE 1 #else # define HAVE_NEON_NATIVE 0 #endif -#define HAVE_NEON_TARGET HAVE_DYNAMIC_ARM_CPU_FEATURES /* * With both gcc and clang, NEON intrinsics require that the main target has * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, * r226563 for arm64), hardware floating point support is sufficient. */ #if HAVE_NEON_NATIVE || \ - (HAVE_NEON_TARGET && GCC_PREREQ(6, 1) && defined(__ARM_FP)) + (HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) # define HAVE_NEON_INTRIN 1 #else # define HAVE_NEON_INTRIN 0 @@ -95,20 +95,50 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_PMULL_NATIVE 0 #endif -#define HAVE_PMULL_TARGET \ +#if HAVE_PMULL_NATIVE || \ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(6, 1) || __has_builtin(__builtin_neon_vmull_p64))) + HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ + (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \ + defined(_MSC_VER)) && \ + /* + * On arm32 with clang, the crypto intrinsics (which include pmull) + * are not defined, even when using -mfpu=crypto-neon-fp-armv8, + * because clang's puts their definitions behind + * __aarch64__. + */ \ + !(defined(ARCH_ARM32) && defined(__clang__))) +# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ + /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ +# ifdef _MSC_VER +# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) +# else +# define compat_vmull_p64(a, b) vmull_p64((a), (b)) +# endif +#else +# define HAVE_PMULL_INTRIN 0 +#endif /* - * On arm32 with clang, the crypto intrinsics (which include pmull) are not - * defined, even when using -mfpu=crypto-neon-fp-armv8, because clang's - * puts their definitions behind __aarch64__. + * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was + * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc + * 13 is needed. 
A minimal program that fails to build due to this bug when + * compiled with -mcpu=emag, at least with gcc 10 through 12, is: + * + * static inline __attribute__((always_inline,target("+crypto"))) void f() {} + * void g() { f(); } + * + * The error is: + * + * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch + * + * The workaround is to explicitly add the crypto target to the non-inline + * function g(), even though this should not be required due to -mcpu=emag + * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. */ -#if HAVE_NEON_INTRIN && (HAVE_PMULL_NATIVE || HAVE_PMULL_TARGET) && \ - !(defined(__arm__) && defined(__clang__)) && \ - CPU_IS_LITTLE_ENDIAN() /* pmull code on big endian is untested */ -# define HAVE_PMULL_INTRIN 1 +#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ + GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 #else -# define HAVE_PMULL_INTRIN 0 +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 #endif /* CRC32 */ @@ -117,30 +147,50 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_CRC32_NATIVE 0 #endif -#define HAVE_CRC32_TARGET \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(4, 9) || __has_builtin(__builtin_arm_crc32b))) -/* - * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled in - * the main target has been affected by two gcc bugs, which we must avoid by - * only allowing gcc versions that have the corresponding fixes. First, gcc - * commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a and - * hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, gcc - * commit c1cdabe3aab8 ("arm: reorder assembler architecture directives - * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when binutils is - * 2.34 or later, due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. - * We use the second set of prerequisites, as they are stricter and we have no - * way to detect the binutils version directly from a C source file. - */ -#define HAVE_CRC32_INTRIN \ - (HAVE_CRC32_NATIVE || (HAVE_CRC32_TARGET && \ - (!GCC_PREREQ(1, 0) || \ - GCC_PREREQ(11, 3) || \ - (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ - (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))))) +#undef HAVE_CRC32_INTRIN +#if HAVE_CRC32_NATIVE +# define HAVE_CRC32_INTRIN 1 +#elif HAVE_DYNAMIC_ARM_CPU_FEATURES +# if GCC_PREREQ(1, 0) + /* + * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled + * in the main target has been affected by two gcc bugs, which we must avoid + * by only allowing gcc versions that have the corresponding fixes. First, + * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a + * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, + * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives + * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when + * binutils is 2.34 or later, due to + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second + * set of prerequisites, as they are stricter and we have no way to detect + * the binutils version directly from a C source file. + * + * Also exclude the cases where the main target arch is armv6kz or armv7e-m. + * In those cases, gcc doesn't let functions that use the main arch be + * inlined into functions that are targeted to armv8-a+crc. (armv8-a is + * necessary for crc to be accepted at all.) That causes build errors. 
+ * This issue happens for these specific sub-archs because they are not a + * subset of armv8-a. Note: clang does not have this limitation. + */ +# if (GCC_PREREQ(11, 3) || \ + (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ + (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ + !defined(__ARM_ARCH_6KZ__) && \ + !defined(__ARM_ARCH_7EM__) +# define HAVE_CRC32_INTRIN 1 +# endif +# elif CLANG_PREREQ(3, 4, 6000000) +# define HAVE_CRC32_INTRIN 1 +# elif defined(_MSC_VER) +# define HAVE_CRC32_INTRIN 1 +# endif +#endif +#ifndef HAVE_CRC32_INTRIN +# define HAVE_CRC32_INTRIN 0 +#endif /* SHA3 (needed for the eor3 instruction) */ -#ifdef __aarch64__ +#if defined(ARCH_ARM64) && !defined(_MSC_VER) # ifdef __ARM_FEATURE_SHA3 # define HAVE_SHA3_NATIVE 1 # else @@ -149,9 +199,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; } # define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ (GCC_PREREQ(8, 1) /* r256478 */ || \ CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) -# define HAVE_SHA3_INTRIN ((HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ +# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ + (HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ (GCC_PREREQ(9, 1) /* r268049 */ || \ - __has_builtin(__builtin_neon_veor3q_v))) + CLANG_PREREQ(13, 0, 13160000))) #else # define HAVE_SHA3_NATIVE 0 # define HAVE_SHA3_TARGET 0 @@ -159,20 +210,22 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #endif /* dotprod */ -#ifdef __aarch64__ +#ifdef ARCH_ARM64 # ifdef __ARM_FEATURE_DOTPROD # define HAVE_DOTPROD_NATIVE 1 # else # define HAVE_DOTPROD_NATIVE 0 # endif -# define HAVE_DOTPROD_TARGET \ +# if HAVE_DOTPROD_NATIVE || \ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) || __has_builtin(__builtin_neon_vdotq_v))) -# define HAVE_DOTPROD_INTRIN \ - (HAVE_NEON_INTRIN && (HAVE_DOTPROD_NATIVE || HAVE_DOTPROD_TARGET)) + (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ + defined(_MSC_VER))) +# define HAVE_DOTPROD_INTRIN 1 +# else +# define HAVE_DOTPROD_INTRIN 0 +# endif #else # define HAVE_DOTPROD_NATIVE 0 -# define HAVE_DOTPROD_TARGET 0 # define HAVE_DOTPROD_INTRIN 0 #endif @@ -184,7 +237,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } * corresponding __ARM_FEATURE_* macros while including the headers. 
*/ #if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(__arm__)) + (defined(__clang__) || defined(ARCH_ARM32)) # define __ARM_FEATURE_CRC32 1 #endif #if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) @@ -194,7 +247,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } # define __ARM_FEATURE_DOTPROD 1 #endif #if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(__arm__)) + (defined(__clang__) || defined(ARCH_ARM32)) # include # undef __ARM_FEATURE_CRC32 #endif @@ -207,6 +260,6 @@ static inline u32 get_arm_cpu_features(void) { return 0; } # undef __ARM_FEATURE_DOTPROD #endif -#endif /* __arm__ || __aarch64__ */ +#endif /* ARCH_ARM32 || ARCH_ARM64 */ #endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/2.0/libdeflate/lib/arm/crc32_impl.h b/2.0/libdeflate/lib/arm/crc32_impl.h index 0db6354d9..c802cdf03 100644 --- a/2.0/libdeflate/lib/arm/crc32_impl.h +++ b/2.0/libdeflate/lib/arm/crc32_impl.h @@ -48,22 +48,31 @@ # if HAVE_CRC32_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ +# ifdef ARCH_ARM32 # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("armv8-a,crc"))) +# define ATTRIBUTES _target_attribute("armv8-a,crc") +# elif defined(__ARM_PCS_VFP) + /* + * +simd is needed to avoid a "selected architecture lacks an FPU" + * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not + * explicitly specified on the command line. + */ +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") # else -# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc"))) +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") # endif # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crc"))) +# define ATTRIBUTES _target_attribute("crc") # else -# define ATTRIBUTES __attribute__((target("+crc"))) +# define ATTRIBUTES _target_attribute("+crc") # endif # endif # endif -#include +#ifndef _MSC_VER +# include +#endif /* * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN @@ -233,26 +242,36 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*() * for implementations that use pmull for folding the data itself. */ -#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN && \ - ((HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE) || \ - (HAVE_CRC32_TARGET && HAVE_PMULL_TARGET)) -# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE +#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN +# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8"))) +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crc,crypto"))) +# define ATTRIBUTES _target_attribute("crc,aes") # else -# define ATTRIBUTES __attribute__((target("+crc,+crypto"))) +# define ATTRIBUTES _target_attribute("+crc,+crypto") # endif # endif # endif -#include +#ifndef _MSC_VER +# include +#endif #include +/* Do carryless multiplication of two 32-bit values. */ +static forceinline ATTRIBUTES u64 +clmul_u32(u32 a, u32 b) +{ + uint64x2_t res = vreinterpretq_u64_p128( + compat_vmull_p64((poly64_t)a, (poly64_t)b)); + + return vgetq_lane_u64(res, 0); +} + /* * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more * quickly, and supports a variable chunk length. 
The chunk length is @@ -262,9 +281,9 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) static forceinline ATTRIBUTES u32 combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) { - u64 res0 = vmull_p64(crc0, crc32_mults_for_chunklen[i][0]); - u64 res1 = vmull_p64(crc1, crc32_mults_for_chunklen[i][1]); - u64 res2 = vmull_p64(crc2, crc32_mults_for_chunklen[i][2]); + u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]); + u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]); + u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]); return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; } @@ -426,16 +445,25 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) #if HAVE_PMULL_INTRIN # define crc32_arm_pmullx4 crc32_arm_pmullx4 # define SUFFIX _pmullx4 -# if HAVE_PMULL_NATIVE +# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("fpu=crypto-neon-fp-armv8"))) +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto"))) + /* + * This used to use "crypto", but that stopped working with clang 16. + * Now only "aes" works. "aes" works with older versions too, so use + * that. No "+" prefix; clang 15 and earlier doesn't accept that. + */ +# define ATTRIBUTES _target_attribute("aes") # else -# define ATTRIBUTES __attribute__((target("+crypto"))) + /* + * With gcc, only "+crypto" works. Both the "+" prefix and the + * "crypto" (not "aes") are essential... + */ +# define ATTRIBUTES _target_attribute("+crypto") # endif # endif # endif @@ -445,17 +473,25 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) static u32 ATTRIBUTES MAYBE_UNUSED crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) { - const poly64x2_t multipliers_4 = (poly64x2_t)CRC32_4VECS_MULTS; - const poly64x2_t multipliers_2 = (poly64x2_t)CRC32_2VECS_MULTS; - const poly64x2_t multipliers_1 = (poly64x2_t)CRC32_1VECS_MULTS; - const uint8x16_t zeroes = (uint8x16_t){ 0 }; - const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF }; + static const u64 _aligned_attribute(16) mults[3][2] = { + CRC32_1VECS_MULTS, + CRC32_4VECS_MULTS, + CRC32_2VECS_MULTS, + }; + static const u64 _aligned_attribute(16) final_mults[3][2] = { + { CRC32_FINAL_MULT, 0 }, + { CRC32_BARRETT_CONSTANT_1, 0 }, + { CRC32_BARRETT_CONSTANT_2, 0 }, + }; + const uint8x16_t zeroes = vdupq_n_u8(0); + const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF)); + const poly64x2_t multipliers_1 = load_multipliers(mults[0]); uint8x16_t v0, v1, v2, v3; if (len < 64 + 15) { if (len < 16) return crc32_slice1(crc, p, len); - v0 = vld1q_u8(p) ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); p += 16; len -= 16; while (len >= 16) { @@ -464,10 +500,12 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) len -= 16; } } else { + const poly64x2_t multipliers_4 = load_multipliers(mults[1]); + const poly64x2_t multipliers_2 = load_multipliers(mults[2]); const size_t align = -(uintptr_t)p & 15; const uint8x16_t *vp; - v0 = vld1q_u8(p) ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); p += 16; /* Align p to the next 16-byte boundary. */ if (align) { @@ -508,21 +546,19 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * which is equivalent to multiplying by x^32. This is needed because * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). 
*/ - v0 = vextq_u8(v0, zeroes, 8) ^ - (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v0), - CRC32_1VECS_MULT_2); + + v0 = veorq_u8(vextq_u8(v0, zeroes, 8), + clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1)); /* Fold 96 => 64 bits. */ - v0 = vextq_u8(v0, zeroes, 4) ^ - (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v0 & mask32), - CRC32_FINAL_MULT); + v0 = veorq_u8(vextq_u8(v0, zeroes, 4), + clmul_low(vandq_u8(v0, mask32), + load_multipliers(final_mults[0]))); /* Reduce 64 => 32 bits using Barrett reduction. */ - v1 = (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v0 & mask32), - CRC32_BARRETT_CONSTANT_1); - v1 = (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(v1 & mask32), - CRC32_BARRETT_CONSTANT_2); - return ((uint32x4_t)(v0 ^ v1))[1]; + v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1])); + v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2])); + return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1); } #undef SUFFIX #undef ATTRIBUTES @@ -535,18 +571,16 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * * See crc32_pmull_wide.h for explanation. */ -#if defined(__aarch64__) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - ((HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE) || \ - (HAVE_PMULL_TARGET && HAVE_CRC32_TARGET)) +#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN # define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc # define SUFFIX _pmullx12_crc -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE +# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto,crc"))) +# define ATTRIBUTES _target_attribute("aes,crc") # else -# define ATTRIBUTES __attribute__((target("+crypto,+crc"))) +# define ATTRIBUTES _target_attribute("+crypto,+crc") # endif # endif # define ENABLE_EOR3 0 @@ -562,25 +596,25 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3. */ -#if defined(__aarch64__) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - ((HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE) || \ - (HAVE_PMULL_TARGET && HAVE_CRC32_TARGET && HAVE_SHA3_TARGET)) +#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ + (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE) # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 # define SUFFIX _pmullx12_crc_eor3 -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE +# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \ + !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES # else # ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto,crc,sha3"))) +# define ATTRIBUTES _target_attribute("aes,crc,sha3") /* * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the * default target is armv8.3-a or later in which case it must be omitted. * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. 
*/ # elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES __attribute__((target("+crypto,+crc,+sha3"))) +# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3") # else -# define ATTRIBUTES __attribute__((target("arch=armv8.2-a+crypto+crc+sha3"))) +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3") # endif # endif # define ENABLE_EOR3 1 diff --git a/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h b/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h index 02dfc3ae0..1cd1cc188 100644 --- a/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h +++ b/2.0/libdeflate/lib/arm/crc32_pmull_helpers.h @@ -39,21 +39,51 @@ #include +/* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */ +#undef u32_to_bytevec +static forceinline ATTRIBUTES uint8x16_t +ADD_SUFFIX(u32_to_bytevec)(u32 a) +{ + return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); +} +#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) + +/* Load two 64-bit values into a vector. */ +#undef load_multipliers +static forceinline ATTRIBUTES poly64x2_t +ADD_SUFFIX(load_multipliers)(const u64 p[2]) +{ + return vreinterpretq_p64_u64(vld1q_u64(p)); +} +#define load_multipliers ADD_SUFFIX(load_multipliers) + +/* Do carryless multiplication of the low halves of two vectors. */ +#undef clmul_low +static forceinline ATTRIBUTES uint8x16_t +ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b) +{ + return vreinterpretq_u8_p128( + compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), + vgetq_lane_p64(b, 0))); +} +#define clmul_low ADD_SUFFIX(clmul_low) + +/* Do carryless multiplication of the high halves of two vectors. */ #undef clmul_high -static forceinline ATTRIBUTES poly128_t -ADD_SUFFIX(clmul_high)(poly64x2_t a, poly64x2_t b) +static forceinline ATTRIBUTES uint8x16_t +ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b) { -#if defined(__clang__) && defined(__aarch64__) +#if defined(__clang__) && defined(ARCH_ARM64) /* * Use inline asm to ensure that pmull2 is really used. This works * around clang bug https://github.com/llvm/llvm-project/issues/52868. */ - poly128_t res; + uint8x16_t res; __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); return res; #else - return vmull_high_p64(a, b); + return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); #endif } #define clmul_high ADD_SUFFIX(clmul_high) @@ -73,7 +103,7 @@ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) return res; #endif #else /* ENABLE_EOR3 */ - return a ^ b ^ c; + return veorq_u8(veorq_u8(a, b), c); #endif /* !ENABLE_EOR3 */ } #define eor3 ADD_SUFFIX(eor3) @@ -82,15 +112,10 @@ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) { - /* - * Using vget_low_* instead of vector indexing is necessary to avoid - * poor code generation with gcc on arm32. 
- */ - poly128_t a = vmull_p64((poly64_t)vget_low_u8(src), - (poly64_t)vget_low_p64(multipliers)); - poly128_t b = clmul_high((poly64x2_t)src, multipliers); + uint8x16_t a = clmul_low(src, multipliers); + uint8x16_t b = clmul_high(src, multipliers); - return eor3((uint8x16_t)a, (uint8x16_t)b, dst); + return eor3(a, b, dst); } #define fold_vec ADD_SUFFIX(fold_vec) @@ -98,7 +123,7 @@ ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) { -#ifdef __aarch64__ +#ifdef ARCH_ARM64 return vqtbl1q_u8(table, indices); #else uint8x8x2_t tab2; @@ -144,7 +169,8 @@ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, x0 = vtbl(v, lshift); /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ - bsl_mask = (uint8x16_t)vshrq_n_s8((int8x16_t)rshift, 7); + bsl_mask = vreinterpretq_u8_s8( + vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); /* * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' diff --git a/2.0/libdeflate/lib/arm/crc32_pmull_wide.h b/2.0/libdeflate/lib/arm/crc32_pmull_wide.h index bf0d722a0..a72e1d876 100644 --- a/2.0/libdeflate/lib/arm/crc32_pmull_wide.h +++ b/2.0/libdeflate/lib/arm/crc32_pmull_wide.h @@ -45,7 +45,9 @@ * Apple M1 processor is an example of such a CPU. */ -#include +#ifndef _MSC_VER +# include +#endif #include #include "crc32_pmull_helpers.h" @@ -53,22 +55,24 @@ static u32 ATTRIBUTES MAYBE_UNUSED ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) { - const poly64x2_t multipliers_12 = (poly64x2_t)CRC32_12VECS_MULTS; - const poly64x2_t multipliers_6 = (poly64x2_t)CRC32_6VECS_MULTS; - const poly64x2_t multipliers_4 = (poly64x2_t)CRC32_4VECS_MULTS; - const poly64x2_t multipliers_3 = (poly64x2_t)CRC32_3VECS_MULTS; - const poly64x2_t multipliers_2 = (poly64x2_t)CRC32_2VECS_MULTS; - const poly64x2_t multipliers_1 = (poly64x2_t)CRC32_1VECS_MULTS; uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; if (len < 3 * 192) { + static const u64 _aligned_attribute(16) mults[3][2] = { + CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS, + }; + poly64x2_t multipliers_4, multipliers_2, multipliers_1; + if (len < 64) goto tail; + multipliers_4 = load_multipliers(mults[0]); + multipliers_2 = load_multipliers(mults[1]); + multipliers_1 = load_multipliers(mults[2]); /* * Short length; don't bother aligning the pointer, and fold * 64 bytes (4 vectors) at a time, at most. 
*/ - v0 = vld1q_u8(p + 0) ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); v1 = vld1q_u8(p + 16); v2 = vld1q_u8(p + 32); v3 = vld1q_u8(p + 48); @@ -92,6 +96,14 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) } v0 = fold_vec(v0, v1, multipliers_1); } else { + static const u64 _aligned_attribute(16) mults[4][2] = { + CRC32_12VECS_MULTS, CRC32_6VECS_MULTS, + CRC32_3VECS_MULTS, CRC32_1VECS_MULTS, + }; + const poly64x2_t multipliers_12 = load_multipliers(mults[0]); + const poly64x2_t multipliers_6 = load_multipliers(mults[1]); + const poly64x2_t multipliers_3 = load_multipliers(mults[2]); + const poly64x2_t multipliers_1 = load_multipliers(mults[3]); const size_t align = -(uintptr_t)p & 15; const uint8x16_t *vp; @@ -114,7 +126,7 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) len -= align; } vp = (const uint8x16_t *)p; - v0 = *vp++ ^ (uint8x16_t)(uint32x4_t){ crc }; + v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); v1 = *vp++; v2 = *vp++; v3 = *vp++; @@ -177,8 +189,8 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) p = (const u8 *)vp; } /* Reduce 128 to 32 bits using crc32 instructions. */ - crc = __crc32d(0, (u64)vget_low_u8(v0)); - crc = __crc32d(crc, (u64)vget_high_u8(v0)); + crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); + crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); tail: /* Finish up the remainder using crc32 instructions. */ if (len & 32) { diff --git a/2.0/libdeflate/lib/arm/matchfinder_impl.h b/2.0/libdeflate/lib/arm/matchfinder_impl.h index 4b10ba2f8..b20f56a3b 100644 --- a/2.0/libdeflate/lib/arm/matchfinder_impl.h +++ b/2.0/libdeflate/lib/arm/matchfinder_impl.h @@ -36,11 +36,7 @@ static forceinline void matchfinder_init_neon(mf_pos_t *data, size_t size) { int16x8_t *p = (int16x8_t *)data; - int16x8_t v = (int16x8_t) { - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - }; + int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); @@ -61,12 +57,7 @@ static forceinline void matchfinder_rebase_neon(mf_pos_t *data, size_t size) { int16x8_t *p = (int16x8_t *)data; - int16x8_t v = (int16x8_t) { - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - }; + int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); diff --git a/2.0/libdeflate/lib/bt_matchfinder.h b/2.0/libdeflate/lib/bt_matchfinder.h index d5b2dd561..b247d4bcc 100644 --- a/2.0/libdeflate/lib/bt_matchfinder.h +++ b/2.0/libdeflate/lib/bt_matchfinder.h @@ -85,7 +85,7 @@ struct lz_match { u16 offset; }; -struct bt_matchfinder { +struct MATCHFINDER_ALIGNED bt_matchfinder { /* The hash table for finding length 3 matches */ mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; @@ -98,8 +98,7 @@ struct bt_matchfinder { * children of the node for the sequence with position 'pos' are * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. 
*/ mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; - -} MATCHFINDER_ALIGNED; +}; /* Prepare the matchfinder for a new input buffer. */ static forceinline void diff --git a/2.0/libdeflate/lib/cpu_features_common.h b/2.0/libdeflate/lib/cpu_features_common.h index f23493816..3019ba28d 100644 --- a/2.0/libdeflate/lib/cpu_features_common.h +++ b/2.0/libdeflate/lib/cpu_features_common.h @@ -29,9 +29,11 @@ #define LIB_CPU_FEATURES_COMMON_H #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) -# undef _ANSI_SOURCE /* for strdup() and strtok_r() */ + /* for strdup() and strtok_r() */ +# undef _ANSI_SOURCE # ifndef __APPLE__ -# define _GNU_SOURCE 1 +# undef _GNU_SOURCE +# define _GNU_SOURCE # endif # include # include diff --git a/2.0/libdeflate/lib/crc32.c b/2.0/libdeflate/lib/crc32.c index 5b55c3dba..c3a4da48b 100644 --- a/2.0/libdeflate/lib/crc32.c +++ b/2.0/libdeflate/lib/crc32.c @@ -169,7 +169,6 @@ */ #include "lib_common.h" -#include "libdeflate.h" #include "crc32_multipliers.h" #include "crc32_tables.h" @@ -223,9 +222,9 @@ crc32_slice1(u32 crc, const u8 *p, size_t len) #undef DEFAULT_IMPL #undef arch_select_crc32_func typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len); -#if defined(__arm__) || defined(__aarch64__) +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/crc32_impl.h" -#elif defined(__i386__) || defined(__x86_64__) +#elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/crc32_impl.h" #endif @@ -254,7 +253,7 @@ static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len) #define crc32_impl DEFAULT_IMPL #endif -LIBDEFLATEEXPORT u32 LIBDEFLATEAPI +LIBDEFLATEAPI u32 libdeflate_crc32(u32 crc, const void *p, size_t len) { if (p == NULL) /* Return initial value. */ diff --git a/2.0/libdeflate/lib/crc32_table.h b/2.0/libdeflate/lib/crc32_table.h deleted file mode 100644 index 05421b982..000000000 --- a/2.0/libdeflate/lib/crc32_table.h +++ /dev/null @@ -1,526 +0,0 @@ -/* - * crc32_table.h - data table to accelerate CRC-32 computation - * - * THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c. DO NOT EDIT. 
- */ - -#include - -static const uint32_t crc32_table[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, -#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8) - 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, - 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, - 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, - 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, - 0x4ac21251, 0x53d92310, 0x78f470d3, 
0x61ef4192, - 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, - 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, - 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, - 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, - 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, - 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, - 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, - 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, - 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, - 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, - 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, - 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, - 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, - 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, - 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, - 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, - 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, - 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, - 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, - 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, - 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, - 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, - 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, - 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, - 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, - 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, - 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, - 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88, - 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, - 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, - 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, - 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, - 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, - 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, - 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, - 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, - 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, - 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, - 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, - 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, - 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, - 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, - 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277, - 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, - 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, - 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, - 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, - 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, - 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, - 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, - 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, - 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, - 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, - 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, - 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, - 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, - 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, - 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, - 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, - 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, - 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, - 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, - 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, - 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, - 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, - 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, - 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, - 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, - 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, - 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, - 0x3157bf84, 0x3095d5b3, 0x32d36bea, 
0x331101dd, - 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, - 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, - 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, - 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, - 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, - 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, - 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, - 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, - 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, - 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, - 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, - 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, - 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, - 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, - 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, - 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, - 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, - 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, - 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, - 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, - 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, - 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, - 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, - 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, - 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, - 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, - 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, - 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, - 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, - 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, - 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, - 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, - 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, - 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, - 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, - 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, - 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, - 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, - 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, - 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, - 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, - 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, - 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, - 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, - 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, - 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, - 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, - 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, - 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, - 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, - 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, - 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, - 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, - 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, - 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, - 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, - 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, - 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, - 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, - 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, - 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, - 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, - 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, - 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, - 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, - 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, - 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, - 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, - 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, - 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, - 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 
0x4c15df3c, - 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, - 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, - 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, - 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, - 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, - 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, - 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, - 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, - 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, - 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, - 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, - 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, - 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, - 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, - 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, - 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, - 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, - 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, - 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, - 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, - 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, - 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, - 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, - 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, - 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, - 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, - 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, - 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b, - 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, - 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, - 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, - 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, - 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, - 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, - 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, - 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, - 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, - 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, - 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, - 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, - 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, - 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, - 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, - 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, - 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, -#endif /* CRC32_SLICE4 || CRC32_SLICE8 */ -#if defined(CRC32_SLICE8) - 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, - 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, - 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, - 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, - 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, - 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, - 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, - 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, - 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, - 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, - 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, - 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, - 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, - 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, - 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, - 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, - 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, - 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, - 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, - 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, - 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, - 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, - 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, - 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, - 0x9932774d, 
0xa4525efd, 0xe3f2242d, 0xde920d9d, - 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, - 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, - 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, - 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, - 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, - 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, - 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, - 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, - 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, - 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, - 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, - 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, - 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, - 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, - 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, - 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, - 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, - 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, - 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, - 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, - 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, - 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, - 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, - 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, - 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, - 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, - 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, - 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, - 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349, - 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, - 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, - 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, - 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, - 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, - 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, - 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, - 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, - 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, - 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, - 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, - 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, - 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, - 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, - 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, - 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, - 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, - 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, - 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, - 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, - 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, - 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, - 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, - 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, - 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, - 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, - 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, - 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, - 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, - 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, - 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, - 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, - 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, - 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, - 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, - 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, - 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, - 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, - 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, - 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, - 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, - 0x7ce0cdba, 
0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, - 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, - 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, - 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, - 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, - 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, - 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, - 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, - 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, - 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, - 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, - 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, - 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, - 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, - 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, - 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, - 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, - 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, - 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, - 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, - 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, - 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, - 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, - 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, - 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, - 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, - 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, - 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, - 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, - 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, - 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, - 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, - 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, - 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, - 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, - 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, - 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, - 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, - 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, - 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, - 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, - 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, - 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, - 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, - 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975, - 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, - 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, - 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, - 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, - 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, - 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, - 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, - 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, - 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, - 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, - 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, - 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, - 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, - 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, - 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, - 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, - 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, - 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, - 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, - 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, - 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, - 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, - 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, - 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, - 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, - 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, - 0xdc27385b, 
0x7a5033ef, 0x4bb82972, 0xedcf22c6, - 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, - 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, - 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, - 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, - 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, - 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, - 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, - 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, - 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, - 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, - 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, - 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, - 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, - 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, - 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, - 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, - 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, - 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, - 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, - 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, - 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, - 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, - 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, - 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, - 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, - 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, - 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, - 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, - 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac, - 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, - 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, - 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, - 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, - 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, - 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, - 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, - 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, - 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, - 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, - 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, - 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, - 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, - 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, - 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb, - 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, - 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, - 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, - 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, - 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, - 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, - 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, - 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, - 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, - 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, - 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, - 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, - 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, - 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, - 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, - 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, - 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, - 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, - 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, - 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, - 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, - 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, - 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, - 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, - 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, - 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, - 0x52bcd85d, 
0x9e16d8c3, 0x1099df20, 0xdc33dfbe, - 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, - 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, - 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, - 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, - 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, - 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, - 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, - 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, - 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, - 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, - 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, - 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, - 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, - 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, - 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, - 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, - 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, - 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, -#endif /* CRC32_SLICE8 */ -}; diff --git a/2.0/libdeflate/lib/crc32_vec_template.h b/2.0/libdeflate/lib/crc32_vec_template.h deleted file mode 100644 index 9a2ad5bde..000000000 --- a/2.0/libdeflate/lib/crc32_vec_template.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * crc32_vec_template.h - template for vectorized CRC-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#define CRC32_SLICE1 1 -static u32 crc32_slice1(u32, const u8 *, size_t); - -/* - * Template for vectorized CRC-32 implementations. - * - * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead - * of crc32_slice8() because only a few bytes need to be processed, so a smaller - * table is preferable. 
- */ -static u32 ATTRIBUTES -FUNCNAME(u32 remainder, const u8 *p, size_t size) -{ - if ((uintptr_t)p % IMPL_ALIGNMENT) { - size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT); - - remainder = crc32_slice1(remainder, p, n); - p += n; - size -= n; - } - if (size >= IMPL_SEGMENT_SIZE) { - remainder = FUNCNAME_ALIGNED(remainder, (const void *)p, - size / IMPL_SEGMENT_SIZE); - p += size - (size % IMPL_SEGMENT_SIZE); - size %= IMPL_SEGMENT_SIZE; - } - return crc32_slice1(remainder, p, size); -} - -#undef FUNCNAME -#undef FUNCNAME_ALIGNED -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_SIZE diff --git a/2.0/libdeflate/lib/decompress_template.h b/2.0/libdeflate/lib/decompress_template.h index a8cad0053..6036d0bca 100644 --- a/2.0/libdeflate/lib/decompress_template.h +++ b/2.0/libdeflate/lib/decompress_template.h @@ -179,11 +179,11 @@ FUNCNAME(struct libdeflate_decompressor * restrict d, /* Run-length encoded codeword lengths */ /* - * Note: we don't need verify that the repeat count - * doesn't overflow the number of elements, since we've - * sized the lens array to have enough extra space to - * allow for the worst-case overrun (138 zeroes when - * only 1 length was remaining). + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). * * In the case of the small repeat counts (presyms 16 * and 17), it is fastest to always write the maximum @@ -241,6 +241,9 @@ FUNCNAME(struct libdeflate_decompressor * restrict d, } } while (i < num_litlen_syms + num_offset_syms); + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { u16 len, nlen; diff --git a/2.0/libdeflate/lib/deflate_compress.c b/2.0/libdeflate/lib/deflate_compress.c index f957987c4..11c007e18 100644 --- a/2.0/libdeflate/lib/deflate_compress.c +++ b/2.0/libdeflate/lib/deflate_compress.c @@ -28,8 +28,6 @@ #include "deflate_compress.h" #include "deflate_constants.h" -#include "libdeflate.h" - /******************************************************************************/ /* @@ -136,7 +134,8 @@ * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher * value isn't very useful since the calculations are approximate anyway. * - * BIT_COST doesn't apply to deflate_flush_block(), which considers whole bits. + * BIT_COST doesn't apply to deflate_flush_block() and + * deflate_compute_true_cost(), which consider whole bits. */ #define BIT_COST 16 @@ -286,46 +285,26 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { }; /* - * A condensed table which maps offset => offset slot as follows: - * - * offset <= 256: deflate_offset_slot[offset] - * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)] - * - * This table was generated by scripts/gen_offset_slot_map.py. + * Table: 'offset - 1 => offset_slot' for offset <= 256. + * This was generated by scripts/gen_offset_slot_map.py. 
*/ -static const u8 deflate_offset_slot[512] = { - 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, - 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, - 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, +static const u8 deflate_offset_slot[256] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, - 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, }; /* The order in which precode codeword lengths are stored */ @@ -478,6 +457,9 @@ struct libdeflate_compressor { void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os); + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; + /* The compression level with which this compressor was created */ unsigned compression_level; @@ -604,6 +586,9 @@ struct libdeflate_compressor { /* The current cost model being used */ struct deflate_costs costs; + /* Saved cost model */ + struct deflate_costs costs_saved; + /* * A table that maps match offset to offset slot. 
This * differs from deflate_offset_slot[] in that this is a @@ -628,7 +613,48 @@ struct libdeflate_compressor { u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; - unsigned num_optim_passes; + /* + * The maximum number of optimization passes + * (min-cost path searches) per block. + * Larger values = more compression. + */ + unsigned max_optim_passes; + + /* + * If an optimization pass improves the cost by fewer + * than this number of bits, then optimization will stop + * early, before max_optim_passes has been reached. + * Smaller values = more compression. + */ + unsigned min_improvement_to_continue; + + /* + * The minimum number of bits that would need to be + * saved for it to be considered worth the time to + * regenerate and use the min-cost path from a previous + * optimization pass, in the case where the final + * optimization pass actually increased the cost. + * Smaller values = more compression. + */ + unsigned min_bits_to_use_nonfinal_path; + + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + } n; /* (n)ear-optimal */ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -676,24 +702,12 @@ struct deflate_output_bitstream { */ u8 *next; - /* - * Pointer to near the end of the output buffer. 'next' will never - * exceed this. There are OUTPUT_END_PADDING bytes reserved after this - * to allow branchlessly writing a whole word at this location. - */ + /* Pointer to the end of the output buffer */ u8 *end; -}; -/* - * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be - * present following os->end, in order to not overrun the buffer when generating - * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) - * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, - * to make the compression algorithm produce the same result on all CPU - * architectures (which is sometimes desirable), we have to unconditionally use - * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. - */ -#define OUTPUT_END_PADDING 8 + /* true if the output buffer ran out of space */ + bool overflow; +}; /* * Add some bits to the bitbuffer variable of the output bitstream. The caller @@ -707,21 +721,29 @@ do { \ ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) -/* Flush bits from the bitbuffer variable to the output buffer. */ +/* + * Flush bits from the bitbuffer variable to the output buffer. After this, the + * bitbuffer will contain at most 7 bits (a partial byte). + * + * Since deflate_flush_block() verified ahead of time that there is enough space + * remaining before actually writing the block, it's guaranteed that out_next + * won't exceed os->end. However, there might not be enough space remaining to + * flush a whole word, even though that's fastest. 
Therefore, flush a whole + * word if there is space for it, otherwise flush a byte at a time. + */ #define FLUSH_BITS() \ do { \ - if (UNALIGNED_ACCESS_IS_FAST) { \ + if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ /* Flush a whole word (branchlessly). */ \ put_unaligned_leword(bitbuf, out_next); \ bitbuf >>= bitcount & ~7; \ - out_next += MIN((size_t)(out_end - out_next), bitcount >> 3); \ + out_next += bitcount >> 3; \ bitcount &= 7; \ } else { \ /* Flush a byte at a time. */ \ while (bitcount >= 8) { \ - *out_next = bitbuf; \ - if (out_next != out_end) \ - out_next++; \ + ASSERT(out_next < os->end); \ + *out_next++ = bitbuf; \ bitcount -= 8; \ bitbuf >>= 8; \ } \ @@ -771,8 +793,7 @@ heapify_array(u32 A[], unsigned length) * Sort the array 'A', which contains 'length' unsigned 32-bit integers. * * Note: name this function heap_sort() instead of heapsort() to avoid colliding - * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't - * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. + * with heapsort() from stdlib.h on BSD-derived systems. */ static void heap_sort(u32 A[], unsigned length) @@ -1311,7 +1332,6 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, * eventually return the codewords. */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* * 'num_used_syms' is the number of symbols with nonzero frequency. * This may be less than @num_syms. 'num_used_syms' is also the number @@ -1320,30 +1340,34 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, */ /* - * Handle special cases where only 0 or 1 symbols were used (had nonzero - * frequency). + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. When this happens, + * it's usually for the offset code (0-1 symbols used). But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. */ - - if (unlikely(num_used_syms == 0)) { - /* - * Code is empty. sort_symbols() already set all lengths to 0, - * so there is nothing more to do. - */ - return; - } - - if (unlikely(num_used_syms == 1)) { - /* - * Only one symbol was used, so we only need one codeword. 
But - * two codewords are needed to form the smallest complete - * Huffman code, which uses codewords 0 and 1. Therefore, we - * choose another symbol to which to assign a codeword. We use - * 0 (if the used symbol is not 0) or 1 (if the used symbol is - * 0). In either case, the lesser-valued symbol must be - * assigned codeword 0 so that the resulting code is canonical. - */ - - unsigned sym = A[0] & SYMBOL_MASK; + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; unsigned nonzero_idx = sym ? sym : 1; codewords[0] = 0; @@ -1427,20 +1451,30 @@ deflate_init_static_codes(struct libdeflate_compressor *c) /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned -deflate_get_offset_slot(unsigned offset) +deflate_get_offset_slot(u32 offset) { -#if 1 - if (offset <= 256) - return deflate_offset_slot[offset]; - else - return deflate_offset_slot[256 + ((offset - 1) >> 7)]; -#else /* Branchless version */ - u32 i1 = offset; - u32 i2 = 256 + ((offset - 1) >> 7); - u32 is_small = (s32)(offset - 257) >> 31; + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. + * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. + */ + unsigned n = (256 - offset) >> 29; - return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)]; -#endif + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned @@ -1660,6 +1694,12 @@ do { \ /* * Choose the best type of block to use (dynamic Huffman, static Huffman, or * uncompressed), then output it. + * + * The uncompressed data of the block is @block_begin[0..@block_length-1]. The + * sequence of literals and matches that will be used to compress the block (if + * a compressed block is chosen) is given by @sequences if it's non-NULL, or + * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set + * according to the literals, matches, and end-of-block symbol. */ static void deflate_flush_block(struct libdeflate_compressor *c, @@ -1681,29 +1721,26 @@ deflate_flush_block(struct libdeflate_compressor *c, bitbuf_t bitbuf = os->bitbuf; unsigned bitcount = os->bitcount; u8 *out_next = os->next; - u8 * const out_end = os->end; - /* The cost for each block type, in bits */ - u32 dynamic_cost = 0; - u32 static_cost = 0; - u32 uncompressed_cost = 0; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. 
+ */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; u32 best_cost; struct deflate_codes *codes; unsigned sym; - ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block); + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); ASSERT(block_length <= MAX_BLOCK_LENGTH); ASSERT(bitcount <= 7); ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); - ASSERT(out_next <= out_end); - - if (sequences != NULL /* !near_optimal */ || - !SUPPORT_NEAR_OPTIMAL_PARSING) { - /* Tally the end-of-block symbol. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - /* Build dynamic Huffman codes. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); - } /* Else, this was already done. */ + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); /* Precompute the precode items and build the precode. */ deflate_precompute_huffman_header(c); @@ -1761,9 +1798,71 @@ deflate_flush_block(struct libdeflate_compressor *c, UINT16_MAX) - 1)) + (8 * block_length); - /* Choose and output the cheapest type of block. */ - best_cost = MIN(static_cost, uncompressed_cost); - if (dynamic_cost < best_cost) { + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > (size_t)(os->end - out_next)) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + (long)(DIV_ROUND_UP(bitcount + 3, 8) + 4 + len)); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. + */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. 
*/ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; const unsigned num_precode_items = c->o.precode.num_items; unsigned precode_sym, precode_item; @@ -1771,7 +1870,6 @@ deflate_flush_block(struct libdeflate_compressor *c, /* Dynamic Huffman block */ - best_cost = dynamic_cost; codes = &c->codes; STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); ADD_BITS(is_final_block, 1); @@ -1823,54 +1921,6 @@ deflate_flush_block(struct libdeflate_compressor *c, deflate_extra_precode_bits[precode_sym]); FLUSH_BITS(); } while (++i < num_precode_items); - } else if (static_cost < uncompressed_cost) { - /* Static Huffman block */ - codes = &c->static_codes; - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); - FLUSH_BITS(); - } else { - /* - * Uncompressed block(s). DEFLATE limits the length of - * uncompressed blocks to UINT16_MAX bytes, so if the length of - * the "block" we're flushing is over UINT16_MAX, we actually - * output multiple blocks. - */ - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = is_final_block; - len = in_end - in_next; - } - if ((size_t)(out_end - out_next) < - (bitcount + 3 + 7) / 8 + 4 + len) { - /* Not enough output space remaining. */ - out_next = out_end; - goto out; - } - /* - * Output BFINAL (1 bit) and BTYPE (2 bits), then align - * to a byte boundary. - */ - STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); - *out_next++ = (bfinal << bitcount) | bitbuf; - if (bitcount > 5) - *out_next++ = 0; - bitbuf = 0; - bitcount = 0; - /* Output LEN and NLEN, then the data itself. */ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - /* Done outputting uncompressed block(s) */ - goto out; } /* Output the literals and matches for a dynamic or static block. */ @@ -1974,18 +2024,30 @@ deflate_flush_block(struct libdeflate_compressor *c, out: ASSERT(bitcount <= 7); /* - * Assert that the block cost was computed correctly, as + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, * libdeflate_deflate_compress_bound() relies on this via the assumption - * that uncompressed blocks will always be used when cheaper. + * that uncompressed blocks will always be used when cheapest. 
*/ - ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == - 3 + best_cost || out_next == out_end); - + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); os->bitbuf = bitbuf; os->bitcount = bitcount; os->next = out_next; } +static void +deflate_finish_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) +{ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, sequences, + is_final_block); +} + /******************************************************************************/ /* @@ -2270,6 +2332,13 @@ calculate_min_match_len(const u8 *data, size_t data_len, unsigned num_used_literals = 0; size_t i; + /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. + */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + /* * For an initial approximation, scan the first 4 KiB of data. The * caller may use recalculate_min_match_len() to update min_len later. @@ -2446,10 +2515,10 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c, } while (in_next < in_max_block_end && seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); - deflate_flush_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.f.sequences, in_next == in_end); - } while (in_next != in_end); + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.f.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2525,10 +2594,10 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c, !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - deflate_flush_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } static forceinline void @@ -2731,10 +2800,10 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - deflate_flush_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2797,6 +2866,59 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; } +static void +deflate_choose_all_literals(struct libdeflate_compressor *c, + const u8 *block, u32 block_length) +{ + u32 i; + + deflate_reset_symbol_frequencies(c); + for (i = 0; i < block_length; i++) + c->freqs.litlen[block[i]]++; + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + deflate_make_huffman_codes(&c->freqs, &c->codes); +} + +/* + * Compute the exact cost, in bits, that would be required to output the matches + * and literals described by @c->freqs as a dynamic Huffman block. The litlen + * and offset codes are assumed to have already been built in @c->codes. 
+ */ +static u32 +deflate_compute_true_cost(struct libdeflate_compressor *c) +{ + u32 cost = 0; + unsigned sym; + + deflate_precompute_huffman_header(c); + + memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, + DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); + + cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + cost += c->o.precode.freqs[sym] * + (c->o.precode.lens[sym] + + deflate_extra_precode_bits[sym]); + } + + for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) + cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; + + for (; sym < DEFLATE_FIRST_LEN_SYM + + ARRAY_LEN(deflate_extra_length_bits); sym++) + cost += c->freqs.litlen[sym] * + (c->codes.lens.litlen[sym] + + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); + + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) + cost += c->freqs.offset[sym] * + (c->codes.lens.offset[sym] + + deflate_extra_offset_bits[sym]); + return cost; +} + /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs_from_codes(struct libdeflate_compressor *c, @@ -3124,11 +3246,11 @@ deflate_adjust_costs_impl(struct libdeflate_compressor *c, /* * Adjust the costs when beginning a new block. * - * Since the current costs have been optimized for the data, it's undesirable to - * throw them away and start over with the default costs. At the same time, we - * don't want to bias the parse by assuming that the next block will be similar - * to the current block. As a compromise, make the costs closer to the - * defaults, but don't simply set them to the defaults. + * Since the current costs are optimized for the data already, it can be helpful + * to reuse them instead of starting over with the default costs. However, this + * depends on how similar the new block is to the previous block. Therefore, + * use a heuristic to decide how similar the blocks are, and mix together the + * current costs and the default costs accordingly. */ static void deflate_adjust_costs(struct libdeflate_compressor *c, @@ -3159,7 +3281,10 @@ deflate_adjust_costs(struct libdeflate_compressor *c, cutoff = ((u64)c->p.n.prev_num_observations * c->split_stats.num_observations * 200) / 512; - if (4 * total_delta > 9 * cutoff) + if (total_delta > 3 * cutoff) + /* Big change in the data; just use the default costs. */ + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else if (4 * total_delta > 9 * cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); else if (2 * total_delta > 3 * cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); @@ -3169,6 +3294,21 @@ deflate_adjust_costs(struct libdeflate_compressor *c, deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); } +static void +deflate_set_initial_costs(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + bool is_first_block) +{ + u32 lit_cost, len_sym_cost; + + deflate_choose_default_litlen_costs(c, block_begin, block_length, + &lit_cost, &len_sym_cost); + if (is_first_block) + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else + deflate_adjust_costs(c, lit_cost, len_sym_cost); +} + /* * Find the minimum-cost path through the graph of possible match/literal * choices for this block. 
@@ -3251,29 +3391,52 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, } cur_node->cost_to_end = best_cost_to_end; } while (cur_node != &c->p.n.optimum_nodes[0]); + + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); } /* - * Choose the literal/match sequence to use for the current block. The basic - * algorithm finds a minimum-cost path through the block's graph of - * literal/match choices, given a cost model. However, the cost of each symbol - * is unknown until the Huffman codes have been built, but at the same time the - * Huffman codes depend on the frequencies of chosen symbols. Consequently, - * multiple passes must be used to try to approximate an optimal solution. The - * first pass uses default costs, mixed with the costs from the previous block - * if any. Later passes use the Huffman codeword lengths from the previous pass - * as the costs. + * Choose the literals and matches for the current block, then output the block. + * + * To choose the literal/match sequence, we find the minimum-cost path through + * the block's graph of literal/match choices, given a cost model. However, the + * true cost of each symbol is unknown until the Huffman codes have been built, + * but at the same time the Huffman codes depend on the frequencies of chosen + * symbols. Consequently, multiple passes must be used to try to approximate an + * optimal solution. The first pass uses default costs, mixed with the costs + * from the previous block when it seems appropriate. Later passes use the + * Huffman codeword lengths from the previous pass as the costs. + * + * As an alternate strategy, also consider using only literals. The boolean + * returned in *used_only_literals indicates whether that strategy was best. */ static void -deflate_optimize_block(struct libdeflate_compressor *c, - const u8 *block_begin, u32 block_length, - const struct lz_match *cache_ptr, bool is_first_block, - bool is_final_block) +deflate_optimize_and_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + bool *used_only_literals) { - unsigned num_passes_remaining = c->p.n.num_optim_passes; - u32 lit_cost, len_sym_cost; + unsigned num_passes_remaining = c->p.n.max_optim_passes; + u32 best_true_cost = UINT32_MAX; + u32 true_cost; + u32 only_lits_cost; + u32 static_cost = UINT32_MAX; + struct deflate_sequence seq_; + struct deflate_sequence *seq = NULL; u32 i; + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. + */ + deflate_choose_all_literals(c, block_begin, block_length); + only_lits_cost = deflate_compute_true_cost(c); + /* * Force the block to really end at the desired length, even if some * matches extend beyond it. @@ -3283,33 +3446,86 @@ deflate_optimize_block(struct libdeflate_compressor *c, ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - /* Set the initial costs. 
*/ - deflate_choose_default_litlen_costs(c, block_begin, block_length, - &lit_cost, &len_sym_cost); - if (is_first_block) - deflate_set_default_costs(c, lit_cost, len_sym_cost); - else - deflate_adjust_costs(c, lit_cost, len_sym_cost); + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; - do { - /* Find the minimum cost path for this pass. */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ - /* Compute frequencies of the chosen symbols. */ - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + + /* Initialize c->p.n.costs with default costs. */ + deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + + do { + /* + * Find the minimum-cost path for this pass. + * Also set c->freqs and c->codes to match the path. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr); - /* Make the Huffman codes. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); + /* + * Compute the exact cost of the block if the path were to be + * used. Note that this differs from + * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses + * the actual Huffman codes instead of c->p.n.costs. + */ + true_cost = deflate_compute_true_cost(c); /* - * Update the costs. After the last optimization pass, the - * final costs won't be needed for this block, but they will be - * used in determining the initial costs for the next block. + * If the cost didn't improve much from the previous pass, then + * doing more passes probably won't be helpful, so stop early. */ - if (--num_passes_remaining || !is_final_block) + if (true_cost + c->p.n.min_improvement_to_continue > + best_true_cost) + break; + + best_true_cost = true_cost; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; + + /* Update the cost model from the Huffman codes. */ + deflate_set_costs_from_codes(c, &c->codes.lens); + + } while (--num_passes_remaining); + + *used_only_literals = false; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); deflate_set_costs_from_codes(c, &c->codes.lens); - } while (num_passes_remaining); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } + } else if (true_cost >= + best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { + /* + * The best solution was actually from a non-final optimization + * pass, so recover and use the min-cost path from that pass. 
+ */ + c->p.n.costs = c->p.n.costs_saved; + deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_set_costs_from_codes(c, &c->codes.lens); + } + deflate_flush_block(c, os, block_begin, block_length, seq, + is_final_block); } static void @@ -3387,6 +3603,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, unsigned nice_len = MIN(c->nice_match_length, max_len); struct lz_match *cache_ptr = c->p.n.match_cache; u32 next_hashes[2] = {0, 0}; + bool prev_block_used_only_literals = false; bt_matchfinder_init(&c->p.n.bt_mf); deflate_near_optimal_init_stats(c); @@ -3405,8 +3622,16 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, * literal/match statistics gathered during matchfinding. * However, the actual near-optimal parse won't respect min_len, * as it can accurately assess the costs of different matches. + * + * If the "use only literals" strategy happened to be the best + * strategy on the previous block, then probably the + * min_match_len heuristic is still not aggressive enough for + * the data, so force gathering literal stats only. */ - min_len = calculate_min_match_len( + if (prev_block_used_only_literals) + min_len = DEFLATE_MAX_MATCH_LEN + 1; + else + min_len = calculate_min_match_len( in_block_begin, in_max_block_end - in_block_begin, c->max_search_depth); @@ -3583,10 +3808,11 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, } while (--num_bytes_to_rewind); cache_len_rewound = orig_cache_ptr - cache_ptr; - deflate_optimize_block(c, in_block_begin, block_length, - cache_ptr, is_first, is_final); - deflate_flush_block(c, os, in_block_begin, block_length, - NULL, is_final); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); memmove(c->p.n.match_cache, cache_ptr, cache_len_rewound * sizeof(*cache_ptr)); cache_ptr = &c->p.n.match_cache[cache_len_rewound]; @@ -3608,16 +3834,17 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, bool is_final = (in_next == in_end); deflate_near_optimal_merge_stats(c); - deflate_optimize_block(c, in_block_begin, block_length, - cache_ptr, is_first, is_final); - deflate_flush_block(c, os, in_block_begin, block_length, - NULL, is_final); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); cache_ptr = &c->p.n.match_cache[0]; deflate_near_optimal_save_stats(c); deflate_near_optimal_init_stats(c); in_block_begin = in_next; } - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* Initialize c->p.n.offset_slot_full. */ @@ -3641,14 +3868,22 @@ deflate_init_offset_slot_full(struct libdeflate_compressor *c) #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI -libdeflate_alloc_compressor(int compression_level) +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options) { struct libdeflate_compressor *c; size_t size = offsetof(struct libdeflate_compressor, p); check_buildtime_parameters(); + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. 
+ */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + if (compression_level < 0 || compression_level > 12) return NULL; @@ -3664,9 +3899,14 @@ libdeflate_alloc_compressor(int compression_level) size += sizeof(c->p.f); } - c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + c = libdeflate_aligned_malloc(options->malloc_func ? + options->malloc_func : + libdeflate_default_malloc_func, + MATCHFINDER_MEM_ALIGNMENT, size); if (!c) return NULL; + c->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; c->compression_level = compression_level; @@ -3734,22 +3974,31 @@ libdeflate_alloc_compressor(int compression_level) c->impl = deflate_compress_near_optimal; c->max_search_depth = 35; c->nice_match_length = 75; - c->p.n.num_optim_passes = 2; + c->p.n.max_optim_passes = 2; + c->p.n.min_improvement_to_continue = 32; + c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; deflate_init_offset_slot_full(c); break; case 11: c->impl = deflate_compress_near_optimal; - c->max_search_depth = 70; + c->max_search_depth = 100; c->nice_match_length = 150; - c->p.n.num_optim_passes = 3; + c->p.n.max_optim_passes = 4; + c->p.n.min_improvement_to_continue = 16; + c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; deflate_init_offset_slot_full(c); break; case 12: default: c->impl = deflate_compress_near_optimal; - c->max_search_depth = 150; + c->max_search_depth = 300; c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - c->p.n.num_optim_passes = 4; + c->p.n.max_optim_passes = 10; + c->p.n.min_improvement_to_continue = 1; + c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; deflate_init_offset_slot_full(c); break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -3760,7 +4009,17 @@ libdeflate_alloc_compressor(int compression_level) return c; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI + +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); +} + +LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) @@ -3775,38 +4034,40 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c, return deflate_compress_none(in, in_nbytes, out, out_nbytes_avail); - /* - * Initialize the output bitstream structure. - * - * The end is set to OUTPUT_END_PADDING below the true end, so that - * FLUSH_BITS() can be more efficient. - */ - if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING)) - return 0; + /* Initialize the output bitstream structure. */ os.bitbuf = 0; os.bitcount = 0; os.next = out; - os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. */ + if (os.overflow) + return 0; + /* - * If 'os.next' reached 'os.end', then either there was not enough space - * in the output buffer, or the compressed size would have been within - * OUTPUT_END_PADDING of the true end. For performance reasons we don't - * distinguish between these cases; we just make sure to return some - * extra space from libdeflate_deflate_compress_bound(). 
+ * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. */ - if (os.next >= os.end) - return 0; ASSERT(os.bitcount <= 7); - if (os.bitcount) + if (os.bitcount) { + ASSERT(os.next < os.end); *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ return os.next - (u8 *)out; } -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *c) { - libdeflate_aligned_free(c); + if (c) + libdeflate_aligned_free(c->free_func, c); } unsigned int @@ -3815,11 +4076,10 @@ libdeflate_get_compression_level(struct libdeflate_compressor *c) return c->compression_level; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_deflate_compress_bound(__attribute__((unused)) struct libdeflate_compressor *c, size_t in_nbytes) { - size_t bound = 0; size_t max_blocks; /* @@ -3836,10 +4096,12 @@ libdeflate_deflate_compress_bound(__attribute__((unused)) struct libdeflate_comp */ /* - * The minimum length that is passed to deflate_flush_block() is - * MIN_BLOCK_LENGTH bytes, except for the final block if needed. + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. * - * If deflate_flush_block() decides to use an uncompressed block, it + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it * actually will (in general) output a series of uncompressed blocks in * order to stay within the UINT16_MAX limit of DEFLATE. But this can * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', @@ -3858,20 +4120,8 @@ libdeflate_deflate_compress_bound(__attribute__((unused)) struct libdeflate_comp * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the * alignment bits at the very start of the block can be disregarded; * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. */ - bound += 5 * max_blocks; - - /* Account for the data itself, stored uncompressed. */ - bound += in_nbytes; - - /* - * Add 1 + OUTPUT_END_PADDING because for performance reasons, the - * compressor doesn't distinguish between cases where there wasn't - * enough space and cases where the compressed size would have been - * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater. Adding - * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room. - */ - bound += 1 + OUTPUT_END_PADDING; - - return bound; + return (5 * max_blocks) + in_nbytes; } diff --git a/2.0/libdeflate/lib/deflate_decompress.c b/2.0/libdeflate/lib/deflate_decompress.c index d7d79e465..25d225239 100644 --- a/2.0/libdeflate/lib/deflate_decompress.c +++ b/2.0/libdeflate/lib/deflate_decompress.c @@ -42,13 +42,9 @@ * instructions enabled and is used automatically at runtime when supported. 
*/ -#include - #include "lib_common.h" #include "deflate_constants.h" -#include "libdeflate.h" - /* * If the expression passed to SAFETY_CHECK() evaluates to false, then the * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the @@ -675,6 +671,9 @@ struct libdeflate_decompressor { bool static_codes_loaded; unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; }; /* @@ -806,38 +805,48 @@ build_decode_table(u32 decode_table[], u32 entry; unsigned i; + /* + * The DEFLATE RFC explicitly allows the offset code to be + * incomplete in two cases: a code containing just 1 codeword, + * if that codeword has length 1; and a code containing no + * codewords. Note: the list of offset codeword lengths is + * always nonempty, but lengths of 0 don't count as codewords. + * + * The RFC doesn't say whether the same cases are allowed for + * the litlen and pre codes. It's actually impossible for no + * symbols to be used from these codes; however, it's + * technically possible for only one symbol to be used. zlib + * allows 1 codeword for the litlen code, but not the precode. + * The RFC also doesn't say whether, when there is 1 codeword, + * that codeword is '0' or '1'. zlib uses '0'. + * + * We accept what zlib accepts, plus a bit more. First, we + * don't treat the precode more strictly than the litlen and + * offset codes. There's no convincing reason to add a special + * case for the precode here. + * + * Second, we just map each allowed incompete code to a complete + * code with only real symbols. To do this, we choose a symbol, + * either the used symbol (for codes with 1 codeword) or an + * arbitrary symbol (for empty codes), and give it both + * codewords '0' and '1'. zlib instead uses a special ERROR + * symbol in the part of the codespace the code doesn't use. + * However, having an ERROR symbol reduces the performance of + * the Huffman decoder, for no real benefit. Our approach also + * avoids having to decide whether '0' or '1' is correct. + * + * Like zlib, we still reject all incomplete codes that contain + * more than 1 codeword or a codeword length greater than 1. + */ if (codespace_used == 0) { - /* - * An empty code is allowed. This can happen for the - * offset code in DEFLATE, since a dynamic Huffman block - * need not contain any matches. - */ - - /* sym=0, len=1 (arbitrary) */ - entry = make_decode_table_entry(decode_results, 0, 1); + sym = 0; /* arbitrary */ } else { - /* - * Allow codes with a single used symbol, with codeword - * length 1. The DEFLATE RFC is unclear regarding this - * case. What zlib's decompressor does is permit this - * for the litlen and offset codes and assume the - * codeword is '0' rather than '1'. We do the same - * except we allow this for precodes too, since there's - * no convincing reason to treat the codes differently. - * We also assign both codewords '0' and '1' to the - * symbol to avoid having to handle '1' specially. - */ if (codespace_used != (1U << (max_codeword_len - 1)) || len_counts[1] != 1) return false; - entry = make_decode_table_entry(decode_results, - *sorted_syms, 1); + sym = sorted_syms[0]; } - /* - * Note: the decode table still must be fully initialized, in - * case the stream is malformed and contains bits from the part - * of the codespace the incomplete code doesn't use. 
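As a concrete example of the mapping described above: a dynamic block may declare an offset code whose only codeword has length 1, say for symbol 5 (a hypothetical but legal incomplete code). Both bit patterns are then made to decode to that one real symbol:

	/* codespace_used == 1U << (max_codeword_len - 1), len_counts[1] == 1 */
	sym = sorted_syms[0];	/* == 5 in this example */
	entry = make_decode_table_entry(decode_results, sym, 1);
	for (i = 0; i < (1U << table_bits); i++)
		decode_table[i] = entry;	/* '0' and '1' both yield symbol 5 */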
- */ + entry = make_decode_table_entry(decode_results, sym, 1); for (i = 0; i < (1U << table_bits); i++) decode_table[i] = entry; return true; @@ -1075,7 +1084,7 @@ typedef enum libdeflate_result (*decompress_func_t) /* Include architecture-specific implementation(s) if available. */ #undef DEFAULT_IMPL #undef arch_select_decompress_func -#if defined(__i386__) || defined(__x86_64__) +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/decompress_impl.h" #endif @@ -1121,7 +1130,7 @@ dispatch_decomp(struct libdeflate_decompressor *d, * handles calling the appropriate implementation depending on the CPU features * at runtime. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -1132,7 +1141,7 @@ libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, actual_in_nbytes_ret, actual_out_nbytes_ret); } -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -1143,9 +1152,22 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *d, NULL, actual_out_nbytes_ret); } -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI -libdeflate_alloc_decompressor(void) +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) { + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ? options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; /* * Note that only certain parts of the decompressor actually must be * initialized here: @@ -1159,18 +1181,28 @@ libdeflate_alloc_decompressor(void) * valgrind, since build_decode_table() is guaranteed to initialize * all entries eventually anyway.) * + * - 'free_func' must be set. + * * But for simplicity, we currently just zero the whole decompressor. */ - struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); - - if (d == NULL) - return NULL; memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? 
+ options->free_func : libdeflate_default_free_func; return d; } -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + +LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *d) { - libdeflate_free(d); + if (d) + d->free_func(d); } diff --git a/2.0/libdeflate/lib/gzip_compress.c b/2.0/libdeflate/lib/gzip_compress.c index 124375291..b7d5076e2 100644 --- a/2.0/libdeflate/lib/gzip_compress.c +++ b/2.0/libdeflate/lib/gzip_compress.c @@ -28,9 +28,7 @@ #include "deflate_compress.h" #include "gzip_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) @@ -83,7 +81,7 @@ libdeflate_gzip_compress(struct libdeflate_compressor *c, return out_next - (u8 *)out; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes) { diff --git a/2.0/libdeflate/lib/gzip_decompress.c b/2.0/libdeflate/lib/gzip_decompress.c index eb5c44335..30adc1769 100644 --- a/2.0/libdeflate/lib/gzip_decompress.c +++ b/2.0/libdeflate/lib/gzip_decompress.c @@ -28,9 +28,7 @@ #include "lib_common.h" #include "gzip_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -134,7 +132,7 @@ libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, return LIBDEFLATE_SUCCESS; } -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, diff --git a/2.0/libdeflate/lib/hc_matchfinder.h b/2.0/libdeflate/lib/hc_matchfinder.h index a690c0ae3..e9e2090a8 100644 --- a/2.0/libdeflate/lib/hc_matchfinder.h +++ b/2.0/libdeflate/lib/hc_matchfinder.h @@ -116,7 +116,7 @@ (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) -struct hc_matchfinder { +struct MATCHFINDER_ALIGNED hc_matchfinder { /* The hash table for finding length 3 matches */ mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; @@ -128,8 +128,7 @@ struct hc_matchfinder { /* The "next node" references for the linked lists. The "next node" of * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; - -} MATCHFINDER_ALIGNED; +}; /* Prepare the matchfinder for a new input buffer. 
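The new *_ex allocators take a caller-provided struct libdeflate_options. Based on the fields this patch references (sizeof_options, malloc_func, free_func), a caller plugging in custom allocators might look roughly like the following sketch (hypothetical wrapper names, error handling elided); libdeflate_alloc_decompressor_ex() follows the same pattern:

#include <stdlib.h>
#include "libdeflate.h"

static void *my_malloc(size_t size) { return malloc(size); }
static void my_free(void *ptr) { free(ptr); }

static struct libdeflate_compressor *
make_compressor_with_custom_allocator(int level)
{
	struct libdeflate_options opts = {
		.sizeof_options = sizeof(opts),
		.malloc_func = my_malloc,
		.free_func = my_free,
	};

	/* NULL on a bad level, an unrecognized options size, or allocation
	 * failure; the recorded free_func is later used by
	 * libdeflate_free_compressor(). */
	return libdeflate_alloc_compressor_ex(level, &opts);
}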
*/ static forceinline void diff --git a/2.0/libdeflate/lib/ht_matchfinder.h b/2.0/libdeflate/lib/ht_matchfinder.h index 898e99394..7588073da 100644 --- a/2.0/libdeflate/lib/ht_matchfinder.h +++ b/2.0/libdeflate/lib/ht_matchfinder.h @@ -54,10 +54,10 @@ /* Minimum value of max_len for ht_matchfinder_longest_match() */ #define HT_MATCHFINDER_REQUIRED_NBYTES 5 -struct ht_matchfinder { +struct MATCHFINDER_ALIGNED ht_matchfinder { mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER] [HT_MATCHFINDER_BUCKET_SIZE]; -} MATCHFINDER_ALIGNED; +}; static forceinline void ht_matchfinder_init(struct ht_matchfinder *mf) diff --git a/2.0/libdeflate/lib/lib_common.h b/2.0/libdeflate/lib/lib_common.h index 10d0160ac..8c9ff5fe0 100644 --- a/2.0/libdeflate/lib/lib_common.h +++ b/2.0/libdeflate/lib/lib_common.h @@ -6,19 +6,48 @@ #define LIB_LIB_COMMON_H #ifdef LIBDEFLATE_H + /* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ # error "lib_common.h must always be included before libdeflate.h" - /* because BUILDING_LIBDEFLATE must be set first */ #endif -#define BUILDING_LIBDEFLATE +#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) +#elif defined(__GNUC__) +# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) +#else +# define LIBDEFLATE_EXPORT_SYM +#endif + +/* + * On i386, gcc assumes that the stack is 16-byte aligned at function entry. + * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi) + * only guarantee 4-byte alignment when calling functions. This is mainly an + * issue on Windows, but it has been seen on Linux too. Work around this ABI + * incompatibility by realigning the stack pointer when entering libdeflate. + * This prevents crashes in SSE/AVX code. 
+ */ +#if defined(__GNUC__) && defined(__i386__) +# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATE_ALIGN_STACK +#endif + +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK #include "../common_defs.h" -void *libdeflate_malloc(size_t size); -void libdeflate_free(void *ptr); +typedef void *(*malloc_func_t)(size_t); +typedef void (*free_func_t)(void *); + +extern malloc_func_t libdeflate_default_malloc_func; +extern free_func_t libdeflate_default_free_func; -void *libdeflate_aligned_malloc(size_t alignment, size_t size); -void libdeflate_aligned_free(void *ptr); +void *libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size); +void libdeflate_aligned_free(free_func_t free_func, void *ptr); #ifdef FREESTANDING /* diff --git a/2.0/libdeflate/lib/matchfinder_common.h b/2.0/libdeflate/lib/matchfinder_common.h index e0e090530..48a243e1d 100644 --- a/2.0/libdeflate/lib/matchfinder_common.h +++ b/2.0/libdeflate/lib/matchfinder_common.h @@ -61,9 +61,9 @@ typedef s16 mf_pos_t; #undef matchfinder_rebase #ifdef _aligned_attribute # define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) -# if defined(__arm__) || defined(__aarch64__) +# if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/matchfinder_impl.h" -# elif defined(__i386__) || defined(__x86_64__) +# elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/matchfinder_impl.h" # endif #else diff --git a/2.0/libdeflate/lib/unaligned.h b/2.0/libdeflate/lib/unaligned.h deleted file mode 100644 index bb48bf828..000000000 --- a/2.0/libdeflate/lib/unaligned.h +++ /dev/null @@ -1,228 +0,0 @@ -/* - * unaligned.h - inline functions for unaligned memory accesses - */ - -#ifndef LIB_UNALIGNED_H -#define LIB_UNALIGNED_H - -#include "lib_common.h" - -/***** Unaligned loads and stores without endianness conversion *****/ - -/* - * memcpy() is portable, and it usually gets optimized appropriately by modern - * compilers. I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled - * to a load or store instruction, not to an actual function call. - * - * We no longer use the "packed struct" approach, as that is nonstandard, has - * unclear semantics, and doesn't receive enough testing - * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). - * - * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception - * where memcpy() generates inefficient code - * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer - * consider that one case important enough to maintain different code for. - * If you run into it, please just use a newer version of gcc (or use clang). 
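libdeflate_aligned_malloc() and libdeflate_aligned_free() now receive the chosen allocator explicitly. The underlying layout is the usual over-allocate-and-stash idiom (the utils.c hunk later in this patch implements it with the ALIGN() macro); a generic sketch, assuming a power-of-two alignment and uintptr_t from <stdint.h>:

static void *
aligned_malloc_sketch(void *(*malloc_func)(size_t), size_t alignment, size_t size)
{
	/* Reserve room for the saved pointer plus worst-case padding. */
	void *orig = malloc_func(sizeof(void *) + alignment - 1 + size);
	uintptr_t aligned;

	if (orig == NULL)
		return NULL;
	/* Round up past the saved-pointer slot to the next aligned address. */
	aligned = ((uintptr_t)orig + sizeof(void *) + alignment - 1) &
		  ~(uintptr_t)(alignment - 1);
	/* Stash the original pointer just below the aligned block. */
	((void **)aligned)[-1] = orig;
	return (void *)aligned;
}

static void
aligned_free_sketch(void (*free_func)(void *), void *ptr)
{
	free_func(((void **)ptr)[-1]);
}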
- */ - -#define DEFINE_UNALIGNED_TYPE(type) \ -static forceinline type \ -load_##type##_unaligned(const void *p) \ -{ \ - type v; \ - memcpy(&v, p, sizeof(v)); \ - return v; \ -} \ - \ -static forceinline void \ -store_##type##_unaligned(type v, void *p) \ -{ \ - memcpy(p, &v, sizeof(v)); \ -} - -DEFINE_UNALIGNED_TYPE(u16) -DEFINE_UNALIGNED_TYPE(u32) -DEFINE_UNALIGNED_TYPE(u64) -DEFINE_UNALIGNED_TYPE(machine_word_t) - -#define load_word_unaligned load_machine_word_t_unaligned -#define store_word_unaligned store_machine_word_t_unaligned - -/***** Unaligned loads with endianness conversion *****/ - -static forceinline u16 -get_unaligned_le16(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le16_bswap(load_u16_unaligned(p)); - else - return ((u16)p[1] << 8) | p[0]; -} - -static forceinline u16 -get_unaligned_be16(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return be16_bswap(load_u16_unaligned(p)); - else - return ((u16)p[0] << 8) | p[1]; -} - -static forceinline u32 -get_unaligned_le32(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le32_bswap(load_u32_unaligned(p)); - else - return ((u32)p[3] << 24) | ((u32)p[2] << 16) | - ((u32)p[1] << 8) | p[0]; -} - -static forceinline u32 -get_unaligned_be32(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return be32_bswap(load_u32_unaligned(p)); - else - return ((u32)p[0] << 24) | ((u32)p[1] << 16) | - ((u32)p[2] << 8) | p[3]; -} - -static forceinline u64 -get_unaligned_le64(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le64_bswap(load_u64_unaligned(p)); - else - return ((u64)p[7] << 56) | ((u64)p[6] << 48) | - ((u64)p[5] << 40) | ((u64)p[4] << 32) | - ((u64)p[3] << 24) | ((u64)p[2] << 16) | - ((u64)p[1] << 8) | p[0]; -} - -static forceinline machine_word_t -get_unaligned_leword(const u8 *p) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - return get_unaligned_le32(p); - else - return get_unaligned_le64(p); -} - -/***** Unaligned stores with endianness conversion *****/ - -static forceinline void -put_unaligned_le16(u16 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u16_unaligned(le16_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - } -} - -static forceinline void -put_unaligned_be16(u16 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u16_unaligned(be16_bswap(v), p); - } else { - p[0] = (u8)(v >> 8); - p[1] = (u8)(v >> 0); - } -} - -static forceinline void -put_unaligned_le32(u32 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u32_unaligned(le32_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - p[2] = (u8)(v >> 16); - p[3] = (u8)(v >> 24); - } -} - -static forceinline void -put_unaligned_be32(u32 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u32_unaligned(be32_bswap(v), p); - } else { - p[0] = (u8)(v >> 24); - p[1] = (u8)(v >> 16); - p[2] = (u8)(v >> 8); - p[3] = (u8)(v >> 0); - } -} - -static forceinline void -put_unaligned_le64(u64 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u64_unaligned(le64_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - p[2] = (u8)(v >> 16); - p[3] = (u8)(v >> 24); - p[4] = (u8)(v >> 32); - p[5] = (u8)(v >> 40); - p[6] = (u8)(v >> 48); - p[7] = (u8)(v >> 56); - } -} - -static forceinline void -put_unaligned_leword(machine_word_t v, u8 *p) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - put_unaligned_le32(v, p); - else - put_unaligned_le64(v, p); -} - -/***** 24-bit loads *****/ - -/* - * Given a 32-bit value that was loaded 
with the platform's native endianness, - * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 - * bits contain the first 3 bytes, arranged in octets in a platform-dependent - * order, at the memory location from which the input 32-bit value was loaded. - */ -static forceinline u32 -loaded_u32_to_u24(u32 v) -{ - if (CPU_IS_LITTLE_ENDIAN()) - return v & 0xFFFFFF; - else - return v >> 8; -} - -/* - * Load the next 3 bytes from the memory location @p into the 24 low-order bits - * of a 32-bit value. The order in which the 3 bytes will be arranged as octets - * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES - * bytes must be available at @p; note that this may be more than 3. - */ -static forceinline u32 -load_u24_unaligned(const u8 *p) -{ -#if UNALIGNED_ACCESS_IS_FAST -# define LOAD_U24_REQUIRED_NBYTES 4 - return loaded_u32_to_u24(load_u32_unaligned(p)); -#else -# define LOAD_U24_REQUIRED_NBYTES 3 - if (CPU_IS_LITTLE_ENDIAN()) - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); - else - return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); -#endif -} - -#endif /* LIB_UNALIGNED_H */ diff --git a/2.0/libdeflate/lib/utils.c b/2.0/libdeflate/lib/utils.c index 46826ef48..c1e4cc26f 100644 --- a/2.0/libdeflate/lib/utils.c +++ b/2.0/libdeflate/lib/utils.c @@ -27,8 +27,6 @@ #include "lib_common.h" -#include "libdeflate.h" - #ifdef FREESTANDING # define malloc NULL # define free NULL @@ -36,27 +34,18 @@ # include #endif -static void *(*libdeflate_malloc_func)(size_t) = malloc; -static void (*libdeflate_free_func)(void *) = free; +malloc_func_t libdeflate_default_malloc_func = malloc; +free_func_t libdeflate_default_free_func = free; void * -libdeflate_malloc(size_t size) +libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size) { - return (*libdeflate_malloc_func)(size); -} + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); -void -libdeflate_free(void *ptr) -{ - (*libdeflate_free_func)(ptr); -} - -void * -libdeflate_aligned_malloc(size_t alignment, size_t size) -{ - void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size); if (ptr) { void *orig_ptr = ptr; + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); ((void **)ptr)[-1] = orig_ptr; } @@ -64,18 +53,17 @@ libdeflate_aligned_malloc(size_t alignment, size_t size) } void -libdeflate_aligned_free(void *ptr) +libdeflate_aligned_free(free_func_t free_func, void *ptr) { - if (ptr) - libdeflate_free(((void **)ptr)[-1]); + (*free_func)(((void **)ptr)[-1]); } -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), - void (*free_func)(void *)) +LIBDEFLATEAPI void +libdeflate_set_memory_allocator(malloc_func_t malloc_func, + free_func_t free_func) { - libdeflate_malloc_func = malloc_func; - libdeflate_free_func = free_func; + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; } /* diff --git a/2.0/libdeflate/lib/x86/adler32_impl.h b/2.0/libdeflate/lib/x86/adler32_impl.h index 52a0c5b04..6285dc80a 100644 --- a/2.0/libdeflate/lib/x86/adler32_impl.h +++ b/2.0/libdeflate/lib/x86/adler32_impl.h @@ -43,30 +43,46 @@ #define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \ { \ - __v4su s1_last = (v_s1), s2_last = (v_s2); \ + __m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \ \ /* 128 => 32 bits */ \ - s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \ - s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \ - 
s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \ + s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \ + s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \ + s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \ \ - *(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \ + *(s1) += (u32)_mm_cvtsi128_si32(s1_last); \ + *(s2) += (u32)_mm_cvtsi128_si32(s2_last); \ } #define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \ { \ - __v4su s1_128bit, s2_128bit; \ + __m128i /* __v4su */ s1_128bit, s2_128bit; \ \ /* 256 => 128 bits */ \ - s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \ - (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1); \ - s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \ - (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1); \ + s1_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s1), 0), \ + _mm256_extracti128_si256((v_s1), 1)); \ + s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \ + _mm256_extracti128_si256((v_s2), 1)); \ \ ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ } +/* + * This is a very silly partial workaround for gcc bug + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to + * generate extra move instructions in some loops containing vector intrinsics. + * + * An alternate workaround would be to use gcc native vector operations instead + * of vector intrinsics. But that would result in MSVC needing its own code. + */ +#if GCC_PREREQ(1, 0) +# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ + __asm__("" : "+x" (a), "+x" (b), "+x" (c), "+x" (d), "+x" (e), "+x" (f)) +#else +# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ + (void)a, (void)b, (void)c, (void)d, (void)e, (void)f +#endif + /* SSE2 implementation */ #if HAVE_SSE2_INTRIN # define adler32_sse2 adler32_sse2 @@ -83,29 +99,37 @@ # if HAVE_SSE2_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("sse2"))) +# define ATTRIBUTES _target_attribute("sse2") # endif # include static forceinline ATTRIBUTES void adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) { const __m128i zeroes = _mm_setzero_si128(); + const __m128i /* __v8hu */ mults_a = + _mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25); + const __m128i /* __v8hu */ mults_b = + _mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17); + const __m128i /* __v8hu */ mults_c = + _mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9); + const __m128i /* __v8hu */ mults_d = + _mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1); /* s1 counters: 32-bit, sum of bytes */ - __v4su v_s1 = (__v4su)zeroes; + __m128i /* __v4su */ v_s1 = zeroes; /* s2 counters: 32-bit, sum of s1 values */ - __v4su v_s2 = (__v4su)zeroes; + __m128i /* __v4su */ v_s2 = zeroes; /* * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes * that eventually need to be multiplied by a number 32...1 for addition * into s2. */ - __v8hu v_byte_sums_a = (__v8hu)zeroes; - __v8hu v_byte_sums_b = (__v8hu)zeroes; - __v8hu v_byte_sums_c = (__v8hu)zeroes; - __v8hu v_byte_sums_d = (__v8hu)zeroes; + __m128i /* __v8hu */ v_byte_sums_a = zeroes; + __m128i /* __v8hu */ v_byte_sums_b = zeroes; + __m128i /* __v8hu */ v_byte_sums_c = zeroes; + __m128i /* __v8hu */ v_byte_sums_d = zeroes; do { /* Load the next 32 bytes. 
*/ @@ -117,37 +141,39 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) * Logically, this really should be v_s2 += v_s1 * 32, but we * can do the multiplication (or left shift) later. */ - v_s2 += v_s1; + v_s2 = _mm_add_epi32(v_s2, v_s1); /* * s1 update: use "Packed Sum of Absolute Differences" to add * the bytes horizontally with 8 bytes per sum. Then add the * sums to the s1 counters. */ - v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes); - v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zeroes)); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zeroes)); /* * Also accumulate the bytes into 32 separate counters that have * 16-bit precision. */ - v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes); - v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes); - v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes); - v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes); + v_byte_sums_a = _mm_add_epi16( + v_byte_sums_a, _mm_unpacklo_epi8(bytes1, zeroes)); + v_byte_sums_b = _mm_add_epi16( + v_byte_sums_b, _mm_unpackhi_epi8(bytes1, zeroes)); + v_byte_sums_c = _mm_add_epi16( + v_byte_sums_c, _mm_unpacklo_epi8(bytes2, zeroes)); + v_byte_sums_d = _mm_add_epi16( + v_byte_sums_d, _mm_unpackhi_epi8(bytes2, zeroes)); + GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, + v_byte_sums_c, v_byte_sums_d); } while (p != end); /* Finish calculating the s2 counters. */ - v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a, - (__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b, - (__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c, - (__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d, - (__m128i)(__v8hu){ 8, 7, 6, 5, 4, 3, 2, 1 }); + v_s2 = _mm_slli_epi32(v_s2, 5); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_a, mults_a)); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_b, mults_b)); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_c, mults_c)); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d)); /* Add the counters to the real s1 and s2. */ ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2); @@ -169,9 +195,17 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) # if HAVE_AVX2_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("avx2"))) +# define ATTRIBUTES _target_attribute("avx2") # endif # include + /* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. + */ +# if defined(__clang__) && defined(_MSC_VER) +# include +# include +# endif static forceinline ATTRIBUTES void adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) { @@ -180,43 +214,50 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) * Note, the multipliers have to be in this order because * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately. 
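The SSE2 kernel above and the AVX2 kernel that follows vectorize the two standard Adler-32 sums: s1, the running sum of bytes, and s2, the running sum of successive s1 values. For reference, a simplified scalar sketch of what they compute; the real code batches the mod-65521 reductions per chunk rather than applying them per byte:

static u32
adler32_scalar_sketch(u32 adler, const u8 *p, size_t len)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	size_t i;

	for (i = 0; i < len; i++) {
		s1 = (s1 + p[i]) % 65521;	/* sum of bytes */
		s2 = (s2 + s1) % 65521;		/* sum of the s1 values */
	}
	return (s2 << 16) | s1;
}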
*/ - const __v16hu mults_a = { 64, 63, 62, 61, 60, 59, 58, 57, - 48, 47, 46, 45, 44, 43, 42, 41, }; - const __v16hu mults_b = { 56, 55, 54, 53, 52, 51, 50, 49, - 40, 39, 38, 37, 36, 35, 34, 33, }; - const __v16hu mults_c = { 32, 31, 30, 29, 28, 27, 26, 25, - 16, 15, 14, 13, 12, 11, 10, 9, }; - const __v16hu mults_d = { 24, 23, 22, 21, 20, 19, 18, 17, - 8, 7, 6, 5, 4, 3, 2, 1, }; - __v8su v_s1 = (__v8su)zeroes; - __v8su v_s2 = (__v8su)zeroes; - __v16hu v_byte_sums_a = (__v16hu)zeroes; - __v16hu v_byte_sums_b = (__v16hu)zeroes; - __v16hu v_byte_sums_c = (__v16hu)zeroes; - __v16hu v_byte_sums_d = (__v16hu)zeroes; + const __m256i /* __v16hu */ mults_a = + _mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57, + 48, 47, 46, 45, 44, 43, 42, 41); + const __m256i /* __v16hu */ mults_b = + _mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49, + 40, 39, 38, 37, 36, 35, 34, 33); + const __m256i /* __v16hu */ mults_c = + _mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25, + 16, 15, 14, 13, 12, 11, 10, 9); + const __m256i /* __v16hu */ mults_d = + _mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17, + 8, 7, 6, 5, 4, 3, 2, 1); + __m256i /* __v8su */ v_s1 = zeroes; + __m256i /* __v8su */ v_s2 = zeroes; + __m256i /* __v16hu */ v_byte_sums_a = zeroes; + __m256i /* __v16hu */ v_byte_sums_b = zeroes; + __m256i /* __v16hu */ v_byte_sums_c = zeroes; + __m256i /* __v16hu */ v_byte_sums_d = zeroes; do { const __m256i bytes1 = *p++; const __m256i bytes2 = *p++; - v_s2 += v_s1; - v_s1 += (__v8su)_mm256_sad_epu8(bytes1, zeroes); - v_s1 += (__v8su)_mm256_sad_epu8(bytes2, zeroes); - v_byte_sums_a += (__v16hu)_mm256_unpacklo_epi8(bytes1, zeroes); - v_byte_sums_b += (__v16hu)_mm256_unpackhi_epi8(bytes1, zeroes); - v_byte_sums_c += (__v16hu)_mm256_unpacklo_epi8(bytes2, zeroes); - v_byte_sums_d += (__v16hu)_mm256_unpackhi_epi8(bytes2, zeroes); + v_s2 = _mm256_add_epi32(v_s2, v_s1); + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes1, zeroes)); + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes2, zeroes)); + v_byte_sums_a = _mm256_add_epi16( + v_byte_sums_a, _mm256_unpacklo_epi8(bytes1, zeroes)); + v_byte_sums_b = _mm256_add_epi16( + v_byte_sums_b, _mm256_unpackhi_epi8(bytes1, zeroes)); + v_byte_sums_c = _mm256_add_epi16( + v_byte_sums_c, _mm256_unpacklo_epi8(bytes2, zeroes)); + v_byte_sums_d = _mm256_add_epi16( + v_byte_sums_d, _mm256_unpackhi_epi8(bytes2, zeroes)); + + GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, + v_byte_sums_c, v_byte_sums_d); } while (p != end); - v_s2 = (__v8su)_mm256_slli_epi32((__m256i)v_s2, 6); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_a, - (__m256i)mults_a); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_b, - (__m256i)mults_b); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_c, - (__m256i)mults_c); - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)v_byte_sums_d, - (__m256i)mults_d); + v_s2 = _mm256_slli_epi32(v_s2, 6); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_a, mults_a)); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b)); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c)); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d)); ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2); } # include "../adler32_vec_template.h" diff --git a/2.0/libdeflate/lib/x86/cpu_features.h b/2.0/libdeflate/lib/x86/cpu_features.h index 796cfa8d9..ad14e435a 100644 --- a/2.0/libdeflate/lib/x86/cpu_features.h +++ b/2.0/libdeflate/lib/x86/cpu_features.h @@ -32,9 +32,9 @@ #define 
HAVE_DYNAMIC_X86_CPU_FEATURES 0 -#if defined(__i386__) || defined(__x86_64__) +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER) # undef HAVE_DYNAMIC_X86_CPU_FEATURES # define HAVE_DYNAMIC_X86_CPU_FEATURES 1 #endif @@ -73,55 +73,36 @@ static inline u32 get_x86_cpu_features(void) { return 0; } * functions. Unfortunately clang has no feature test macro for this, so we * have to check its version. */ -#define HAVE_TARGET_INTRINSICS \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000))) - -/* - * Before gcc 5.1 and clang 3.9, emmintrin.h only defined vectors of signed - * integers (e.g. __v4si), not vectors of unsigned integers (e.g. __v4su). We - * need the unsigned ones to avoid signed integer overflow, which is undefined - * behavior. Add the missing definitions for the unsigned ones if needed. - */ -#if (GCC_PREREQ(4, 0) && !GCC_PREREQ(5, 1)) || \ - (defined(__clang__) && !CLANG_PREREQ(3, 9, 8020000)) || \ - defined(__INTEL_COMPILER) -typedef unsigned long long __v2du __attribute__((__vector_size__(16))); -typedef unsigned int __v4su __attribute__((__vector_size__(16))); -typedef unsigned short __v8hu __attribute__((__vector_size__(16))); -typedef unsigned char __v16qu __attribute__((__vector_size__(16))); -typedef unsigned long long __v4du __attribute__((__vector_size__(32))); -typedef unsigned int __v8su __attribute__((__vector_size__(32))); -typedef unsigned short __v16hu __attribute__((__vector_size__(32))); -typedef unsigned char __v32qu __attribute__((__vector_size__(32))); -#endif -#ifdef __INTEL_COMPILER -typedef int __v16si __attribute__((__vector_size__(64))); -typedef short __v32hi __attribute__((__vector_size__(64))); -typedef char __v64qi __attribute__((__vector_size__(64))); +#if HAVE_DYNAMIC_X86_CPU_FEATURES && \ + (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER)) +# define HAVE_TARGET_INTRINSICS 1 +#else +# define HAVE_TARGET_INTRINSICS 0 #endif /* SSE2 */ -#ifdef __SSE2__ +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && \ + (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) # define HAVE_SSE2_NATIVE 1 #else # define HAVE_SSE2_NATIVE 0 #endif -#define HAVE_SSE2_TARGET HAVE_DYNAMIC_X86_CPU_FEATURES -#define HAVE_SSE2_INTRIN \ - (HAVE_SSE2_NATIVE || (HAVE_SSE2_TARGET && HAVE_TARGET_INTRINSICS)) +#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) /* PCLMUL */ -#ifdef __PCLMUL__ +#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) # define HAVE_PCLMUL_NATIVE 1 #else # define HAVE_PCLMUL_NATIVE 0 #endif -#define HAVE_PCLMUL_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))) -#define HAVE_PCLMUL_INTRIN \ - (HAVE_PCLMUL_NATIVE || (HAVE_PCLMUL_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER))) +# define HAVE_PCLMUL_INTRIN 1 +#else +# define HAVE_PCLMUL_INTRIN 0 +#endif /* AVX */ #ifdef __AVX__ @@ -129,11 +110,13 @@ typedef char __v64qi __attribute__((__vector_size__(64))); #else # define HAVE_AVX_NATIVE 0 #endif -#define HAVE_AVX_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))) -#define HAVE_AVX_INTRIN \ - (HAVE_AVX_NATIVE || (HAVE_AVX_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && 
\ + (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX_INTRIN 1 +#else +# define HAVE_AVX_INTRIN 0 +#endif /* AVX2 */ #ifdef __AVX2__ @@ -141,24 +124,38 @@ typedef char __v64qi __attribute__((__vector_size__(64))); #else # define HAVE_AVX2_NATIVE 0 #endif -#define HAVE_AVX2_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256))) -#define HAVE_AVX2_INTRIN \ - (HAVE_AVX2_NATIVE || (HAVE_AVX2_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX2_INTRIN 1 +#else +# define HAVE_AVX2_INTRIN 0 +#endif /* BMI2 */ -#ifdef __BMI2__ +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) # define HAVE_BMI2_NATIVE 1 #else # define HAVE_BMI2_NATIVE 0 #endif -#define HAVE_BMI2_TARGET \ - (HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))) -#define HAVE_BMI2_INTRIN \ - (HAVE_BMI2_NATIVE || (HAVE_BMI2_TARGET && HAVE_TARGET_INTRINSICS)) +#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_BMI2_INTRIN 1 +#else +# define HAVE_BMI2_INTRIN 0 +#endif +/* + * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. + */ +#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */ +# undef HAVE_BMI2_NATIVE +# undef HAVE_BMI2_INTRIN +# define HAVE_BMI2_NATIVE 0 +# define HAVE_BMI2_INTRIN 0 +#endif -#endif /* __i386__ || __x86_64__ */ +#endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/2.0/libdeflate/lib/x86/crc32_impl.h b/2.0/libdeflate/lib/x86/crc32_impl.h index 10ac8adc8..79cc7944e 100644 --- a/2.0/libdeflate/lib/x86/crc32_impl.h +++ b/2.0/libdeflate/lib/x86/crc32_impl.h @@ -37,7 +37,7 @@ # if HAVE_PCLMUL_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("pclmul"))) +# define ATTRIBUTES _target_attribute("pclmul") # endif # define FOLD_PARTIAL_VECS 0 # include "crc32_pclmul_template.h" @@ -52,16 +52,17 @@ * SSSE3 and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for * efficient handling of partial blocks. (We *could* compile a variant with * PCLMUL+SSSE3+SSE4.1 w/o AVX, but for simplicity we don't currently bother.) + * + * FIXME: with MSVC, this isn't actually compiled with AVX code generation + * enabled yet. That would require that this be moved to its own .c file. */ -#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN && \ - ((HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE) || \ - (HAVE_PCLMUL_TARGET && HAVE_AVX_TARGET)) +#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN # define crc32_x86_pclmul_avx crc32_x86_pclmul_avx # define SUFFIX _pclmul_avx # if HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES __attribute__((target("pclmul,avx"))) +# define ATTRIBUTES _target_attribute("pclmul,avx") # endif # define FOLD_PARTIAL_VECS 1 # include "crc32_pclmul_template.h" diff --git a/2.0/libdeflate/lib/x86/crc32_pclmul_template.h b/2.0/libdeflate/lib/x86/crc32_pclmul_template.h index a8ea91b41..1d5782375 100644 --- a/2.0/libdeflate/lib/x86/crc32_pclmul_template.h +++ b/2.0/libdeflate/lib/x86/crc32_pclmul_template.h @@ -49,10 +49,19 @@ */ #include +/* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. 
+ */ +#if defined(__clang__) && defined(_MSC_VER) +# include +# include +# include +#endif #undef fold_vec static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __v2di multipliers) +ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __m128i /* __v2di */ multipliers) { /* * The immediate constant for PCLMULQDQ specifies which 64-bit halves of @@ -61,8 +70,9 @@ ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __v2di multipliers) * 0x00 means low halves (higher degree polynomial terms for us) * 0x11 means high halves (lower degree polynomial terms for us) */ - return dst ^ _mm_clmulepi64_si128(src, multipliers, 0x00) ^ - _mm_clmulepi64_si128(src, multipliers, 0x11); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); + return dst; } #define fold_vec ADD_SUFFIX(fold_vec) @@ -77,7 +87,7 @@ ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __v2di multipliers) #undef fold_partial_vec static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, - __v2di multipliers_1) + __m128i /* __v2du */ multipliers_1) { /* * pshufb(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. @@ -115,13 +125,20 @@ ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len, static u32 ATTRIBUTES MAYBE_UNUSED ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) { - const __v2di multipliers_8 = (__v2di)CRC32_8VECS_MULTS; - const __v2di multipliers_4 = (__v2di)CRC32_4VECS_MULTS; - const __v2di multipliers_2 = (__v2di)CRC32_2VECS_MULTS; - const __v2di multipliers_1 = (__v2di)CRC32_1VECS_MULTS; - const __v2di final_multiplier = (__v2di){ CRC32_FINAL_MULT }; - const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF }; - const __v2di barrett_reduction_constants = (__v2di)CRC32_BARRETT_CONSTANTS; + const __m128i /* __v2du */ multipliers_8 = + _mm_set_epi64x(CRC32_8VECS_MULT_2, CRC32_8VECS_MULT_1); + const __m128i /* __v2du */ multipliers_4 = + _mm_set_epi64x(CRC32_4VECS_MULT_2, CRC32_4VECS_MULT_1); + const __m128i /* __v2du */ multipliers_2 = + _mm_set_epi64x(CRC32_2VECS_MULT_2, CRC32_2VECS_MULT_1); + const __m128i /* __v2du */ multipliers_1 = + _mm_set_epi64x(CRC32_1VECS_MULT_2, CRC32_1VECS_MULT_1); + const __m128i /* __v2du */ final_multiplier = + _mm_set_epi64x(0, CRC32_FINAL_MULT); + const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); + const __m128i /* __v2du */ barrett_reduction_constants = + _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, + CRC32_BARRETT_CONSTANT_1); __m128i v0, v1, v2, v3, v4, v5, v6, v7; /* @@ -135,7 +152,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) if (len < 16) return crc32_slice1(crc, p, len); - v0 = _mm_loadu_si128((const void *)p) ^ (__m128i)(__v4si){crc}; + v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); p += 16; if (len >= 64) { @@ -187,7 +205,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) const __m128i *vp; #if FOLD_PARTIAL_VECS - v0 = _mm_loadu_si128((const void *)p) ^ (__m128i)(__v4si){crc}; + v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); p += 16; /* Align p to the next 16-byte boundary. 
*/ if (align) { @@ -204,7 +223,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) len -= align; } vp = (const __m128i *)p; - v0 = *vp++ ^ (__m128i)(__v4si){crc}; + v0 = _mm_xor_si128(*vp++, _mm_cvtsi32_si128(crc)); #endif v1 = *vp++; v2 = *vp++; @@ -265,12 +284,13 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * which is equivalent to multiplying by x^32. This is needed because * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). */ - v0 = _mm_srli_si128(v0, 8) ^ - _mm_clmulepi64_si128(v0, multipliers_1, 0x10); + v0 = _mm_xor_si128(_mm_srli_si128(v0, 8), + _mm_clmulepi64_si128(v0, multipliers_1, 0x10)); /* Fold 96 => 64 bits. */ - v0 = _mm_srli_si128(v0, 4) ^ - _mm_clmulepi64_si128(v0 & mask32, final_multiplier, 0x00); + v0 = _mm_xor_si128(_mm_srli_si128(v0, 4), + _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + final_multiplier, 0x00)); /* * Reduce 64 => 32 bits using Barrett reduction. @@ -314,10 +334,15 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * R(x) = B(x) + G(x)*floor ( ------------------------- ) * \ x^32 / */ - v1 = _mm_clmulepi64_si128(v0 & mask32, barrett_reduction_constants, 0x00); - v1 = _mm_clmulepi64_si128(v1 & mask32, barrett_reduction_constants, 0x10); - crc = ((__v4si)(v0 ^ v1))[1]; -#if !FOLD_PARTIAL_VECS + v1 = _mm_clmulepi64_si128(_mm_and_si128(v0, mask32), + barrett_reduction_constants, 0x00); + v1 = _mm_clmulepi64_si128(_mm_and_si128(v1, mask32), + barrett_reduction_constants, 0x10); + v0 = _mm_xor_si128(v0, v1); +#if FOLD_PARTIAL_VECS + crc = _mm_extract_epi32(v0, 1); +#else + crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(v0, 0x01)); /* Process up to 15 bytes left over at the end. */ crc = crc32_slice1(crc, p, len); #endif diff --git a/2.0/libdeflate/lib/x86/decompress_impl.h b/2.0/libdeflate/lib/x86/decompress_impl.h index 3dc189285..3e2ec37e7 100644 --- a/2.0/libdeflate/lib/x86/decompress_impl.h +++ b/2.0/libdeflate/lib/x86/decompress_impl.h @@ -3,12 +3,17 @@ #include "cpu_features.h" -/* BMI2 optimized version */ +/* + * BMI2 optimized version + * + * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation + * enabled yet. That would require that this be moved to its own .c file. + */ #if HAVE_BMI2_INTRIN # define deflate_decompress_bmi2 deflate_decompress_bmi2 # define FUNCNAME deflate_decompress_bmi2 # if !HAVE_BMI2_NATIVE -# define ATTRIBUTES __attribute__((target("bmi2"))) +# define ATTRIBUTES _target_attribute("bmi2") # endif /* * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the @@ -20,7 +25,7 @@ */ # ifndef __clang__ # include -# ifdef __x86_64__ +# ifdef ARCH_X86_64 # define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) # define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) # else diff --git a/2.0/libdeflate/lib/x86/x86_cpu_features.c b/2.0/libdeflate/lib/x86/x86_cpu_features.c index c9dd88a53..d51f947d6 100644 --- a/2.0/libdeflate/lib/x86/x86_cpu_features.c +++ b/2.0/libdeflate/lib/x86/x86_cpu_features.c @@ -30,50 +30,60 @@ #if HAVE_DYNAMIC_X86_CPU_FEATURES -/* With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ -#if defined(__i386__) && defined(__PIC__) +/* + * With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). 
See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 + */ +#if defined(ARCH_X86_32) && defined(__PIC__) # define EBX_CONSTRAINT "=&r" #else # define EBX_CONSTRAINT "=b" #endif -/* Execute the CPUID instruction. */ +/* Execute the CPUID instruction. */ static inline void cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) { - __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" - "cpuid \n" - ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) - : "a" (leaf), "c" (subleaf)); +#ifdef _MSC_VER + int result[4]; + + __cpuidex(result, leaf, subleaf); + *a = result[0]; + *b = result[1]; + *c = result[2]; + *d = result[3]; +#else + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +#endif } -/* Read an extended control register. */ +/* Read an extended control register. */ static inline u64 read_xcr(u32 indx) { - u32 edx, eax; - - /* Execute the "xgetbv" instruction. Old versions of binutils do not - * recognize this instruction, so list the raw bytes instead. */ - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (indx)); - - return ((u64)edx << 32) | eax; +#ifdef _MSC_VER + return _xgetbv(indx); +#else + u32 d, a; + + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. + */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (indx)); + + return ((u64)d << 32) | a; +#endif } -#undef BIT -#define BIT(nr) (1UL << (nr)) - -#define XCR0_BIT_SSE BIT(1) -#define XCR0_BIT_AVX BIT(2) -#define XCR0_BIT_OPMASK BIT(5) -#define XCR0_BIT_ZMM_HI256 BIT(6) -#define XCR0_BIT_HI16_ZMM BIT(7) - -#define IS_SET(reg, nr) ((reg) & BIT(nr)) -#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask)) - static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, {X86_CPU_FEATURE_PCLMUL, "pclmul"}, @@ -87,47 +97,34 @@ volatile u32 libdeflate_x86_cpu_features = 0; /* Initialize libdeflate_x86_cpu_features. */ void libdeflate_init_x86_cpu_features(void) { + u32 max_leaf, a, b, c, d; + u64 xcr0 = 0; u32 features = 0; - u32 dummy1, dummy2, dummy3, dummy4; - u32 max_function; - u32 features_1, features_2, features_3, features_4; - bool os_avx_support = false; - - /* Get maximum supported function */ - cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); - if (max_function < 1) - goto out; - /* Standard feature flags */ - cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &b, &c, &d); + if (max_leaf < 1) + goto out; - if (IS_SET(features_1, 26)) + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + if (d & (1 << 26)) features |= X86_CPU_FEATURE_SSE2; - - if (IS_SET(features_2, 1)) + if (c & (1 << 1)) features |= X86_CPU_FEATURE_PCLMUL; - - if (IS_SET(features_2, 27)) { /* OSXSAVE set? 
*/ - u64 xcr0 = read_xcr(0); - - os_avx_support = IS_ALL_SET(xcr0, - XCR0_BIT_SSE | - XCR0_BIT_AVX); - } - - if (os_avx_support && IS_SET(features_2, 28)) + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX; - if (max_function < 7) + if (max_leaf < 7) goto out; - /* Extended feature flags */ - cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); - - if (os_avx_support && IS_SET(features_3, 5)) + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX2; - - if (IS_SET(features_3, 8)) + if (b & (1 << 8)) features |= X86_CPU_FEATURE_BMI2; out: diff --git a/2.0/libdeflate/lib/zlib_compress.c b/2.0/libdeflate/lib/zlib_compress.c index bf1dffa87..12d43602d 100644 --- a/2.0/libdeflate/lib/zlib_compress.c +++ b/2.0/libdeflate/lib/zlib_compress.c @@ -28,9 +28,7 @@ #include "deflate_compress.h" #include "zlib_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail) @@ -75,7 +73,7 @@ libdeflate_zlib_compress(struct libdeflate_compressor *c, return out_next - (u8 *)out; } -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes) { diff --git a/2.0/libdeflate/lib/zlib_decompress.c b/2.0/libdeflate/lib/zlib_decompress.c index 6c70603f9..f5e43eaeb 100644 --- a/2.0/libdeflate/lib/zlib_decompress.c +++ b/2.0/libdeflate/lib/zlib_decompress.c @@ -28,9 +28,7 @@ #include "lib_common.h" #include "zlib_constants.h" -#include "libdeflate.h" - -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -94,7 +92,7 @@ libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, return LIBDEFLATE_SUCCESS; } -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress(struct libdeflate_decompressor *d, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, diff --git a/2.0/libdeflate/libdeflate.h b/2.0/libdeflate/libdeflate.h index ffe402e2a..0a274f0f3 100644 --- a/2.0/libdeflate/libdeflate.h +++ b/2.0/libdeflate/libdeflate.h @@ -5,45 +5,29 @@ #ifndef LIBDEFLATE_H #define LIBDEFLATE_H +#include +#include + #ifdef __cplusplus extern "C" { #endif #define LIBDEFLATE_VERSION_MAJOR 1 -#define LIBDEFLATE_VERSION_MINOR 14 -#define LIBDEFLATE_VERSION_STRING "1.14" - -#include -#include +#define LIBDEFLATE_VERSION_MINOR 19 +#define LIBDEFLATE_VERSION_STRING "1.19" /* - * On Windows, you must define LIBDEFLATE_STATIC if you are linking to the - * static library version of libdeflate instead of the DLL. On other platforms, - * LIBDEFLATE_STATIC has no effect. + * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause + * __declspec(dllimport) to be used. This should be done when it's easy to do. + * Otherwise it's fine to skip it, since it is a very minor performance + * optimization that is irrelevant for most use cases of libdeflate. 
*/ -#ifdef _WIN32 -# if defined(LIBDEFLATE_STATIC) -# define LIBDEFLATEEXPORT -# elif defined(BUILDING_LIBDEFLATE) -# define LIBDEFLATEEXPORT __declspec(dllexport) +#ifndef LIBDEFLATEAPI +# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATEAPI __declspec(dllimport) # else -# define LIBDEFLATEEXPORT __declspec(dllimport) +# define LIBDEFLATEAPI # endif -#else -# define LIBDEFLATEEXPORT __attribute__((visibility("default"))) -#endif - -#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && defined(__i386__) - /* - * On i386, gcc assumes that the stack is 16-byte aligned at function entry. - * However, some compilers (e.g. MSVC) and programming languages (e.g. - * Delphi) only guarantee 4-byte alignment when calling functions. Work - * around this ABI incompatibility by realigning the stack pointer when - * entering libdeflate. This prevents crashes in SSE/AVX code. - */ -# define LIBDEFLATEAPI __attribute__((force_align_arg_pointer)) -#else -# define LIBDEFLATEAPI #endif /* ========================================================================== */ @@ -51,6 +35,7 @@ extern "C" { /* ========================================================================== */ struct libdeflate_compressor; +struct libdeflate_options; /* * libdeflate_alloc_compressor() allocates a new compressor that supports @@ -70,15 +55,22 @@ struct libdeflate_compressor; * A single compressor is not safe to use by multiple threads concurrently. * However, different threads may use different compressors concurrently. */ -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI +LIBDEFLATEAPI struct libdeflate_compressor * libdeflate_alloc_compressor(int compression_level); +/* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options); + /* * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and - * write the results to 'out', which has space for 'out_nbytes_avail' bytes. - * The return value is the compressed size in bytes, or 0 if the data could not - * be compressed to 'out_nbytes_avail' bytes or fewer (but see note below). + * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The + * return value is the compressed size in bytes, or 0 if the data could not be + * compressed to 'out_nbytes_avail' bytes or fewer. * * If compression is successful, then the output data is guaranteed to be a * valid DEFLATE stream that decompresses to the input data. No other @@ -88,24 +80,8 @@ libdeflate_alloc_compressor(int compression_level); * writing tests that compare compressed data to a golden output, as this can * break when libdeflate is updated. (This property isn't specific to * libdeflate; the same is true for zlib and other compression libraries too.) - * - * Note: due to a performance optimization, libdeflate_deflate_compress() - * currently needs a small amount of slack space at the end of the output - * buffer. As a result, it can't actually report compressed sizes very close to - * 'out_nbytes_avail'. This doesn't matter in real-world use cases, and - * libdeflate_deflate_compress_bound() already includes the slack space. 
- * However, it does mean that testing code that redundantly compresses data - * using an exact-sized output buffer won't work as might be expected: - * - * out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, - * libdeflate_deflate_compress_bound(in_nbytes)); - * // The following assertion will fail. - * assert(libdeflate_deflate_compress(c, in, in_nbytes, out, out_nbytes) != 0); - * - * To avoid this, either don't write tests like the above, or make sure to - * include at least 9 bytes of slack space in 'out_nbytes_avail'. */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *compressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail); @@ -114,11 +90,10 @@ libdeflate_deflate_compress(struct libdeflate_compressor *compressor, * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the * number of bytes of compressed data that may be produced by compressing any * buffer of length less than or equal to 'in_nbytes' using - * libdeflate_deflate_compress() with the specified compressor. Mathematically, - * this bound will necessarily be a number greater than or equal to 'in_nbytes'. - * It may be an overestimate of the true upper bound. The return value is - * guaranteed to be the same for all invocations with the same compressor and - * same 'in_nbytes'. + * libdeflate_deflate_compress() with the specified compressor. This bound will + * necessarily be a number greater than or equal to 'in_nbytes'. It may be an + * overestimate of the true upper bound. The return value is guaranteed to be + * the same for all invocations with the same compressor and same 'in_nbytes'. * * As a special case, 'compressor' may be NULL. This causes the bound to be * taken across *any* libdeflate_compressor that could ever be allocated with @@ -135,15 +110,15 @@ libdeflate_deflate_compress(struct libdeflate_compressor *compressor, * libdeflate_deflate_compress() returns 0, indicating that the compressed data * did not fit into the provided output buffer. */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, size_t in_nbytes); /* - * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper - * format. + * Like libdeflate_deflate_compress(), but uses the zlib wrapper format instead + * of raw DEFLATE. */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress(struct libdeflate_compressor *compressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail); @@ -153,15 +128,15 @@ libdeflate_zlib_compress(struct libdeflate_compressor *compressor, * compressed with libdeflate_zlib_compress() rather than with * libdeflate_deflate_compress(). */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, size_t in_nbytes); /* - * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper - * format. + * Like libdeflate_deflate_compress(), but uses the gzip wrapper format instead + * of raw DEFLATE. 
*/ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress(struct libdeflate_compressor *compressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail); @@ -171,7 +146,7 @@ libdeflate_gzip_compress(struct libdeflate_compressor *compressor, * compressed with libdeflate_gzip_compress() rather than with * libdeflate_deflate_compress(). */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +LIBDEFLATEAPI size_t libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, size_t in_nbytes); @@ -180,7 +155,7 @@ libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is * taken. */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *compressor); /* ========================================================================== */ @@ -188,6 +163,7 @@ libdeflate_free_compressor(struct libdeflate_compressor *compressor); /* ========================================================================== */ struct libdeflate_decompressor; +struct libdeflate_options; /* * libdeflate_alloc_decompressor() allocates a new decompressor that can be used @@ -201,9 +177,15 @@ struct libdeflate_decompressor; * A single decompressor is not safe to use by multiple threads concurrently. * However, different threads may use different decompressors concurrently. */ -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI +LIBDEFLATEAPI struct libdeflate_decompressor * libdeflate_alloc_decompressor(void); +/* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + /* * Result of a call to libdeflate_deflate_decompress(), * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). @@ -212,8 +194,8 @@ enum libdeflate_result { /* Decompression was successful. */ LIBDEFLATE_SUCCESS = 0, - /* Decompressed failed because the compressed data was invalid, corrupt, - * or otherwise unsupported. */ + /* Decompression failed because the compressed data was invalid, + * corrupt, or otherwise unsupported. */ LIBDEFLATE_BAD_DATA = 1, /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have @@ -226,13 +208,12 @@ enum libdeflate_result { }; /* - * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream - * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The - * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' - * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. - * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If - * a nonzero result code is returned, then the contents of the output buffer are - * undefined. + * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer + * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If + * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise, + * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the + * contents of the output buffer are undefined. * * Decompression stops at the end of the DEFLATE stream (as indicated by the * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. 
@@ -257,7 +238,7 @@ enum libdeflate_result { * not large enough but no other problems were encountered, or another * nonzero result code if decompression failed for another reason. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -269,7 +250,7 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, * then the actual compressed size of the DEFLATE stream (aligned to the next * byte boundary) is written to *actual_in_nbytes_ret. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -284,7 +265,7 @@ libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, * use libdeflate_zlib_decompress_ex(). */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -297,7 +278,7 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, * buffer was decompressed), then the actual number of input bytes consumed is * written to *actual_in_nbytes_ret. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -312,7 +293,7 @@ libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need * multi-member support. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -325,7 +306,7 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, * buffer was decompressed), then the actual number of input bytes consumed is * written to *actual_in_nbytes_ret. */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail, @@ -337,7 +318,7 @@ libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action * is taken. */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); /* ========================================================================== */ @@ -350,7 +331,7 @@ libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); * required initial value for 'adler' is 1. This value is also returned when * 'buffer' is specified as NULL. */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI +LIBDEFLATEAPI uint32_t libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); @@ -360,7 +341,7 @@ libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); * initial value for 'crc' is 0. 
This value is also returned when 'buffer' is * specified as NULL. */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI +LIBDEFLATEAPI uint32_t libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); /* ========================================================================== */ @@ -369,16 +350,60 @@ libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); /* * Install a custom memory allocator which libdeflate will use for all memory - * allocations. 'malloc_func' is a function that must behave like malloc(), and - * 'free_func' is a function that must behave like free(). + * allocations by default. 'malloc_func' is a function that must behave like + * malloc(), and 'free_func' is a function that must behave like free(). + * + * The per-(de)compressor custom memory allocator that can be specified in + * 'struct libdeflate_options' takes priority over this. * - * There must not be any libdeflate_compressor or libdeflate_decompressor - * structures in existence when calling this function. + * This doesn't affect the free() function that will be used to free + * (de)compressors that were already in existence when this is called. */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI +LIBDEFLATEAPI void libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), void (*free_func)(void *)); +/* + * Advanced options. This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. + */ +struct libdeflate_options { + + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; + + /* + * An optional custom memory allocator to use for this (de)compressor. + * 'malloc_func' must be a function that behaves like malloc(), and + * 'free_func' must be a function that behaves like free(). + * + * This is useful in cases where a process might have multiple users of + * libdeflate who want to use different memory allocators. For example, + * a library might want to use libdeflate with a custom memory allocator + * without interfering with user code that might use libdeflate too. + * + * This takes priority over the "global" memory allocator (which by + * default is malloc() and free(), but can be changed by + * libdeflate_set_memory_allocator()). Moreover, libdeflate will never + * call the "global" memory allocator if a per-(de)compressor custom + * allocator is always given. + */ + void *(*malloc_func)(size_t); + void (*free_func)(void *); +}; + #ifdef __cplusplus } #endif diff --git a/2.0/plink2.cc b/2.0/plink2.cc index 72bdec56e..3f771c9a2 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -72,7 +72,7 @@ static const char ver_str[] = "PLINK v2.00a6" #elif defined(USE_AOCL) " AMD" #endif - " (27 Sep 2023)"; + " (28 Sep 2023)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same ""