From 32a773961bcfeccae6ca7560dce15625e30d766d Mon Sep 17 00:00:00 2001 From: Lucas Sloan Date: Wed, 27 Mar 2024 19:17:41 -0700 Subject: [PATCH] Implement brotli's version of lz_extend. This provides a speedup of roughly 1-3% for compression, depending on the level. Before: ``` Compression Total Compressed | Compression | Method Size Size Ratio | Iters Time Rate | ---------------------------------------------------------------------------------- libdeflate-gzip:1 270062086 85630362 3.154 | 20 26.493 s 194.43 MB/s | libdeflate-gzip:2 270062086 84037129 3.214 | 20 35.747 s 144.10 MB/s | libdeflate-gzip:3 270062086 82391861 3.278 | 20 39.707 s 129.73 MB/s | libdeflate-gzip:4 270062086 81420541 3.317 | 20 43.029 s 119.71 MB/s | libdeflate-gzip:5 270062086 78832080 3.426 | 20 50.630 s 101.74 MB/s | libdeflate-gzip:6 270062086 78021372 3.461 | 20 63.719 s 80.84 MB/s | libdeflate-gzip:7 270062086 77594012 3.480 | 20 87.918 s 58.59 MB/s | libdeflate-gzip:8 270062086 77190199 3.499 | 20 147.452 s 34.93 MB/s | libdeflate-gzip:9 270062086 77156775 3.500 | 20 191.025 s 26.97 MB/s | ``` After: ``` Compression Total Compressed | Compression | Method Size Size Ratio | Iters Time Rate | ---------------------------------------------------------------------------------- libdeflate-gzip:1 270062086 85630362 3.154 | 20 26.228 s 196.39 MB/s | libdeflate-gzip:2 270062086 84037129 3.214 | 20 34.950 s 147.38 MB/s | libdeflate-gzip:3 270062086 82391861 3.278 | 20 39.140 s 131.61 MB/s | libdeflate-gzip:4 270062086 81420541 3.317 | 20 41.927 s 122.86 MB/s | libdeflate-gzip:5 270062086 78832080 3.426 | 20 50.023 s 102.97 MB/s | libdeflate-gzip:6 270062086 78021372 3.461 | 20 61.799 s 83.35 MB/s | libdeflate-gzip:7 270062086 77594012 3.480 | 20 85.528 s 60.23 MB/s | libdeflate-gzip:8 270062086 77190199 3.499 | 20 145.867 s 35.31 MB/s | libdeflate-gzip:9 270062086 77156775 3.500 | 20 189.208 s 27.22 MB/s | ``` --- lib/matchfinder_common.h | 62 ++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 37 
deletions(-) diff --git a/lib/matchfinder_common.h b/lib/matchfinder_common.h index 9c8be243..03c1c23a 100644 --- a/lib/matchfinder_common.h +++ b/lib/matchfinder_common.h @@ -176,49 +176,37 @@ lz_hash(u32 seq, unsigned num_bits) * to a maximum of @max_len. Initially, @start_len bytes are matched. */ static forceinline unsigned -lz_extend(const u8 * const strptr, const u8 * const matchptr, - const unsigned start_len, const unsigned max_len) +lz_extend(const u8 * strptr, const u8 * matchptr, + const unsigned start_len, unsigned max_len) { - unsigned len = start_len; - machine_word_t v_word; + const u8 * const matchptr_orig = matchptr; + strptr += start_len; + matchptr += start_len; + max_len -= start_len; if (UNALIGNED_ACCESS_IS_FAST) { - - if (likely(max_len - len >= 4 * WORDBYTES)) { - - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDBYTES; \ - - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - #undef COMPARE_WORD_STEP - } - - while (len + WORDBYTES <= max_len) { - v_word = load_word_unaligned(&matchptr[len]) ^ - load_word_unaligned(&strptr[len]); - if (v_word != 0) - goto word_differs; - len += WORDBYTES; + for (; max_len >= WORDBYTES; max_len -= WORDBYTES) { + machine_word_t v_word = load_word_unaligned(strptr) ^ + load_word_unaligned(matchptr); + strptr += WORDBYTES; + if (v_word != 0) { + unsigned matching_bits; + if (CPU_IS_LITTLE_ENDIAN()) + matching_bits = bsfw(v_word); + else + matching_bits = WORDBITS - 1 - bsrw(v_word); + return (unsigned)(matchptr - matchptr_orig) + (matching_bits >> 3); + } + matchptr += WORDBYTES; } } - while (len < max_len && matchptr[len] == strptr[len]) - len++; - return len; - -word_differs: - if (CPU_IS_LITTLE_ENDIAN()) - len += (bsfw(v_word) >> 3); - else - len += (WORDBITS - 1 - bsrw(v_word)) >> 3; - return len; + while (max_len && *matchptr == *strptr) { + max_len--; + 
++strptr; + ++matchptr; + } + return (unsigned)(matchptr - matchptr_orig); } #endif /* LIB_MATCHFINDER_COMMON_H */