Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recompression #824

Open
wants to merge 15 commits into
base: recompression
Choose a base branch
from
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,25 @@ https://groups.google.com/forum/#!forum/brotli
[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/google/brotli?branch=master&svg=true)](https://ci.appveyor.com/project/szabadka/brotli)
[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/brotli.svg)](https://oss-fuzz-build-logs.storage.googleapis.com/index.html#brotli)

### **How to use for recompression**

`BrotliEncoderCompressSimilarDeletion`, declared in `c/compress_similar_files/compress_similar_files.h`, is the main function to use for recompression.
It takes:
* Input compressed data
* _start_ and _end_ positions of the area to delete (indexed as in uncompressed input data)
* A buffer for output data
* Other parameters that mirror those of the original main Brotli compression function `BrotliEncoderCompress`, e.g. the compression level.

As a result, the output buffer contains the compressed form of the uncompressed input file with the [start, end) range removed.

An example of its use is in the file `c/compress_similar_files/example.cc`.
To build and run `example.cc`, do the following inside the `c/compress_similar_files` directory:

make
./example level file_name start end

An explanation of how it works is in `recompression_doc.md`


### Build instructions

#### Vcpkg
Expand Down
2 changes: 1 addition & 1 deletion c/common/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ static BROTLI_INLINE void BrotliUnalignedWrite64(void* p, uint64_t v) {
typedef BROTLI_ALIGNED(1) uint64_t brotli_unaligned_uint64_t;

/* Reads a 64-bit value from |p| through brotli_unaligned_uint64_t, the
   uint64_t type declared with BROTLI_ALIGNED(1), so |p| does not have to be
   8-byte aligned.
   The pointee is only read, so the cast keeps the const qualifier of |p|
   instead of discarding it. */
static BROTLI_INLINE uint64_t BrotliUnalignedRead64(const void* p) {
  return (uint64_t) ((const brotli_unaligned_uint64_t*) p)[0];
}
static BROTLI_INLINE void BrotliUnalignedWrite64(void* p, uint64_t v) {
brotli_unaligned_uint64_t* dwords = (brotli_unaligned_uint64_t*) p;
Expand Down
432 changes: 432 additions & 0 deletions c/compress_similar_files/example.cc

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions c/compress_similar_files/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright 2020 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Toolchain settings. -MMD makes the compiler write example.d next to
# example.o, listing the headers that example.cc depends on.
CXX=g++
CXXFLAGS=-g -Wall -MMD -std=c++11
LDLIBS=-lstdc++ -lbrotlienc -lbrotlidec -lz

.PHONY: all clean

all: example

# No recipes are written out: make's built-in rules compile example.cc into
# example.o with $(CXX) $(CXXFLAGS), and link example from example.o with
# $(CC) and $(LDLIBS), which is why -lstdc++ is listed explicitly.
example: example.o

# Pull in the header dependencies generated by -MMD so that a header change
# triggers a rebuild. The leading '-' ignores the file while it does not
# exist yet.
-include example.d

# -f keeps `make clean` from failing when there is nothing to remove.
clean:
	rm -f example.o example.d example
94 changes: 79 additions & 15 deletions c/dec/decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <stdlib.h> /* free, malloc */
#include <string.h> /* memcpy, memset */
#include <stdio.h> /* fprintf */

#include "../common/constants.h"
#include "../common/context.h"
Expand Down Expand Up @@ -70,6 +71,10 @@ BROTLI_BOOL BrotliDecoderSetParameter(
state->large_window = TO_BROTLI_BOOL(!!value);
return BROTLI_TRUE;

case BROTLI_DECODER_PARAM_SAVE_INFO:
state->save_info_for_recompression = TO_BROTLI_BOOL(!!value);
return BROTLI_TRUE;

default: return BROTLI_FALSE;
}
}
Expand Down Expand Up @@ -125,6 +130,7 @@ static BROTLI_NOINLINE BrotliDecoderResult SaveErrorCode(
return BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT;

default:

return BROTLI_DECODER_RESULT_ERROR;
}
}
Expand Down Expand Up @@ -1194,41 +1200,72 @@ static BROTLI_INLINE void PrepareLiteralDecoding(BrotliDecoderState* s) {
/* Decodes the block type and updates the state for literal context.
   Reads 3..54 bits.

   |safe| is forwarded to DecodeBlockTypeAndLength; |position| is the output
   position of the switch as computed by the caller. Returns BROTLI_FALSE
   only when DecodeBlockTypeAndLength fails, in which case nothing is
   recorded and PrepareLiteralDecoding is not called.

   When |s->save_info_for_recompression| is set, the switch is also logged in
   |s->literals_block_splits|: the currently open block (if one was started)
   gets |position| as its end, and a new block beginning at |position| is
   opened. Its stored type is the decoded block type offset by
   |num_types_prev_metablocks|.

   The log arrays have a fixed capacity. Once they are full, further switches
   are not recorded; the log is truncated instead of being written past the
   end of the allocation.
   NOTE(review): the stored type is narrowed to uint8_t, so offsets above 255
   wrap; BrotliDecoderStateMetablockBegin and
   BrotliDecoderStateCleanupAfterMetablock index the same arrays and need an
   equivalent capacity check. */
static BROTLI_INLINE BROTLI_BOOL DecodeLiteralBlockSwitchInternal(
    int safe, BrotliDecoderState* s, int position) {
  if (!DecodeBlockTypeAndLength(safe, s, 0)) {
    return BROTLI_FALSE;
  }
  if (s->save_info_for_recompression) {
    BlockSplitFromDecoder* split = &s->literals_block_splits;
    /* Close the open block, but only if its start was saved earlier and its
       slot lies inside the allocation. */
    if (s->saved_position_literals_begin &&
        split->num_blocks < split->positions_alloc_size) {
      split->positions_end[split->num_blocks] = (uint32_t)position;
      split->num_blocks++;
    }
    /* Open the next block only while there is room for both its position
       and its type. */
    if (split->num_blocks < split->positions_alloc_size &&
        split->num_blocks < split->types_alloc_size) {
      split->positions_begin[split->num_blocks] = (uint32_t)position;
      split->types[split->num_blocks] =
          (uint8_t)(s->block_type_rb[0 * 2 + 1] +
                    split->num_types_prev_metablocks);
      split->num_types =
          BROTLI_MAX(size_t, split->num_types,
                     split->types[split->num_blocks] + 1);
    }
  }
  PrepareLiteralDecoding(s);
  return BROTLI_TRUE;
}

/* Non-inlined literal block switch that calls
   DecodeLiteralBlockSwitchInternal with safe = 0 and discards its result.
   |position| is forwarded unchanged so that the switch can be logged for
   recompression. */
static void BROTLI_NOINLINE DecodeLiteralBlockSwitch(
    BrotliDecoderState* s, int position) {
  DecodeLiteralBlockSwitchInternal(0, s, position);
}

/* Non-inlined literal block switch that calls
   DecodeLiteralBlockSwitchInternal with safe = 1 and returns its result, so
   the caller can react when the switch could not be decoded.
   |position| is forwarded unchanged so that the switch can be logged for
   recompression. */
static BROTLI_BOOL BROTLI_NOINLINE SafeDecodeLiteralBlockSwitch(
    BrotliDecoderState* s, int position) {
  return DecodeLiteralBlockSwitchInternal(1, s, position);
}

/* Block switch for insert/copy length.
   Reads 3..54 bits.

   |safe| is forwarded to DecodeBlockTypeAndLength; |position| is the output
   position of the switch as computed by the caller. Returns BROTLI_FALSE
   only when DecodeBlockTypeAndLength fails, in which case neither
   |s->htree_command| nor the recompression log is touched.

   When |s->save_info_for_recompression| is set, the switch is also logged in
   |s->insert_copy_length_block_splits|: the currently open block (if one was
   started) gets |position| as its end, and a new block beginning at
   |position| is opened. Its stored type is the decoded block type offset by
   |num_types_prev_metablocks|.

   The log arrays have a fixed capacity. Once they are full, further switches
   are not recorded; the log is truncated instead of being written past the
   end of the allocation.
   NOTE(review): the stored type is narrowed to uint8_t, so offsets above 255
   wrap; BrotliDecoderStateMetablockBegin and
   BrotliDecoderStateCleanupAfterMetablock index the same arrays and need an
   equivalent capacity check. */
static BROTLI_INLINE BROTLI_BOOL DecodeCommandBlockSwitchInternal(
    int safe, BrotliDecoderState* s, int position) {
  if (!DecodeBlockTypeAndLength(safe, s, 1)) {
    return BROTLI_FALSE;
  }
  s->htree_command = s->insert_copy_hgroup.htrees[s->block_type_rb[3]];
  if (s->save_info_for_recompression) {
    BlockSplitFromDecoder* split = &s->insert_copy_length_block_splits;
    /* Close the open block, but only if its start was saved earlier and its
       slot lies inside the allocation. */
    if (s->saved_position_lengths_begin &&
        split->num_blocks < split->positions_alloc_size) {
      split->positions_end[split->num_blocks] = (uint32_t)position;
      split->num_blocks++;
    }
    /* Open the next block only while there is room for both its position
       and its type. */
    if (split->num_blocks < split->positions_alloc_size &&
        split->num_blocks < split->types_alloc_size) {
      split->positions_begin[split->num_blocks] = (uint32_t)position;
      split->types[split->num_blocks] =
          (uint8_t)(s->block_type_rb[3] + split->num_types_prev_metablocks);
      split->num_types =
          BROTLI_MAX(size_t, split->num_types,
                     split->types[split->num_blocks] + 1);
    }
  }
  return BROTLI_TRUE;
}

/* Non-inlined insert/copy block switch that calls
   DecodeCommandBlockSwitchInternal with safe = 0 and discards its result.
   |position| is forwarded unchanged so that the switch can be logged for
   recompression. */
static void BROTLI_NOINLINE DecodeCommandBlockSwitch(
    BrotliDecoderState* s, int position) {
  DecodeCommandBlockSwitchInternal(0, s, position);
}

/* Non-inlined insert/copy block switch that calls
   DecodeCommandBlockSwitchInternal with safe = 1 and returns its result, so
   the caller can react when the switch could not be decoded.
   |position| is forwarded unchanged so that the switch can be logged for
   recompression. */
static BROTLI_BOOL BROTLI_NOINLINE SafeDecodeCommandBlockSwitch(
    BrotliDecoderState* s, int position) {
  return DecodeCommandBlockSwitchInternal(1, s, position);
}

/* Block switch for distance codes.
Expand Down Expand Up @@ -1736,7 +1773,6 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
int i = s->loop_counter;
BrotliDecoderErrorCode result = BROTLI_DECODER_SUCCESS;
BrotliBitReader* br = &s->br;

if (!CheckInputAmount(safe, br, 28)) {
result = BROTLI_DECODER_NEEDS_MORE_INPUT;
goto saveStateAndReturn;
Expand Down Expand Up @@ -1768,7 +1804,7 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
goto saveStateAndReturn;
}
if (BROTLI_PREDICT_FALSE(s->block_length[1] == 0)) {
BROTLI_SAFE(DecodeCommandBlockSwitch(s));
BROTLI_SAFE(DecodeCommandBlockSwitch(s, pos + (s->rb_roundtrips << s->window_bits)));
goto CommandBegin;
}
/* Read the insert/copy length in the command. */
Expand Down Expand Up @@ -1796,7 +1832,7 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
goto saveStateAndReturn;
}
if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
BROTLI_SAFE(DecodeLiteralBlockSwitch(s));
BROTLI_SAFE(DecodeLiteralBlockSwitch(s, pos + (s->rb_roundtrips << s->window_bits)));
PreloadSymbol(safe, s->literal_htree, br, &bits, &value);
if (!s->trivial_literal_context) goto CommandInner;
}
Expand Down Expand Up @@ -1832,7 +1868,7 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
goto saveStateAndReturn;
}
if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
BROTLI_SAFE(DecodeLiteralBlockSwitch(s));
BROTLI_SAFE(DecodeLiteralBlockSwitch(s, pos + (s->rb_roundtrips << s->window_bits)));
if (s->trivial_literal_context) goto CommandInner;
}
context = BROTLI_CONTEXT(p1, p2, s->context_lookup);
Expand Down Expand Up @@ -1889,6 +1925,14 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
s->max_distance =
(pos < s->max_backward_distance) ? pos : s->max_backward_distance;
}
/* Save backward reference info if needed */
if (s->save_info_for_recompression) {
s->commands[s->commands_size].copy_len = s->copy_length;
s->commands[s->commands_size].distance = s->distance_code;
s->commands[s->commands_size].position = pos + (s->rb_roundtrips << s->window_bits);
s->commands[s->commands_size].max_distance = s->max_distance;
++s->commands_size;
}
i = s->copy_length;
/* Apply copy of LZ77 back-reference, or static dictionary reference if
the distance is larger than the max LZ77 distance */
Expand Down Expand Up @@ -2033,14 +2077,20 @@ static BROTLI_NOINLINE BrotliDecoderErrorCode SafeProcessCommands(

BrotliDecoderResult BrotliDecoderDecompress(
size_t encoded_size, const uint8_t* encoded_buffer, size_t* decoded_size,
uint8_t* decoded_buffer) {
uint8_t* decoded_buffer, BROTLI_BOOL save_info_for_recompression,
BackwardReferenceFromDecoder** backward_references,
size_t* backward_references_size,
BlockSplitFromDecoder* literals_block_splits,
BlockSplitFromDecoder* insert_copy_length_block_splits) {
BrotliDecoderState s;
BrotliDecoderResult result;
size_t total_out = 0;
size_t available_in = encoded_size;
const uint8_t* next_in = encoded_buffer;
size_t available_out = *decoded_size;
uint8_t* next_out = decoded_buffer;
s.save_info_for_recompression = save_info_for_recompression;

if (!BrotliDecoderStateInit(&s, 0, 0, 0)) {
return BROTLI_DECODER_RESULT_ERROR;
}
Expand All @@ -2051,6 +2101,14 @@ BrotliDecoderResult BrotliDecoderDecompress(
if (result != BROTLI_DECODER_RESULT_SUCCESS) {
result = BROTLI_DECODER_RESULT_ERROR;
}
if (s.save_info_for_recompression) {
*backward_references = s.commands;
*backward_references_size = s.commands_size;
*literals_block_splits = s.literals_block_splits;
*insert_copy_length_block_splits = s.insert_copy_length_block_splits;
}


return result;
}

Expand All @@ -2070,6 +2128,12 @@ BrotliDecoderResult BrotliDecoderDecompressStream(
size_t* available_out, uint8_t** next_out, size_t* total_out) {
BrotliDecoderErrorCode result = BROTLI_DECODER_SUCCESS;
BrotliBitReader* br = &s->br;
/* Will save a commands here to use for the recompression */
if (s->save_info_for_recompression && !s->commands) {
s->commands = (BackwardReferenceFromDecoder*)BROTLI_DECODER_ALLOC(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be using this ALLOC function instead of malloc elsewhere? If so, maybe add a TODO to make sure this happens as part of productization?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Everything inside decoder/encoder should allocate/free memory via macros to allow "custom memory manager" feature.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inside Encoder/Decoder I'm using Brotli allocation, malloc is used only in testing and in compress_similar_files.h file as they are not a part of the library. However, when compress_similar_files.h will be a part of the library then Brotli macros should be used

s, sizeof(BackwardReferenceFromDecoder) * (int)((float)*available_in));
s->commands_alloc_size = *available_in;
}
/* Ensure that |total_out| is set, even if no data will ever be pushed out. */
if (total_out) {
*total_out = s->partial_pos_out;
Expand Down
60 changes: 60 additions & 0 deletions c/dec/state.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,32 @@ BROTLI_BOOL BrotliDecoderStateInit(BrotliDecoderState* s,
s->rb_roundtrips = 0;
s->partial_pos_out = 0;

s->commands = NULL;
s->commands_size = 0;
if (s->save_info_for_recompression) {
s->literals_block_splits.types = (uint8_t*)BROTLI_DECODER_ALLOC(s, sizeof(uint8_t) * 100000);
s->literals_block_splits.positions_begin = (uint32_t*)BROTLI_DECODER_ALLOC(s, sizeof(uint32_t) * 100000);
s->literals_block_splits.positions_end = (uint32_t*)BROTLI_DECODER_ALLOC(s, sizeof(uint32_t) * 100000);
s->literals_block_splits.num_types = 0;
s->literals_block_splits.num_types_prev_metablocks = 0;
s->literals_block_splits.num_blocks = 0;
s->literals_block_splits.types_alloc_size = 100000;
s->literals_block_splits.positions_alloc_size = 100000;

s->insert_copy_length_block_splits.types = (uint8_t*)BROTLI_DECODER_ALLOC(s, sizeof(uint8_t) * 100000);
s->insert_copy_length_block_splits.positions_begin = (uint32_t*)BROTLI_DECODER_ALLOC(s, sizeof(uint32_t) * 100000);
s->insert_copy_length_block_splits.positions_end = (uint32_t*)BROTLI_DECODER_ALLOC(s, sizeof(uint32_t) * 100000);
s->insert_copy_length_block_splits.num_types = 0;
s->insert_copy_length_block_splits.num_types_prev_metablocks = 0;
s->insert_copy_length_block_splits.num_blocks = 0;
s->insert_copy_length_block_splits.types_alloc_size = 100000;
s->insert_copy_length_block_splits.positions_alloc_size = 100000;
}


s->saved_position_literals_begin = BROTLI_FALSE;
s->saved_position_lengths_begin = BROTLI_FALSE;

s->block_type_trees = NULL;
s->block_len_trees = NULL;
s->ringbuffer = NULL;
Expand Down Expand Up @@ -115,6 +141,23 @@ void BrotliDecoderStateMetablockBegin(BrotliDecoderState* s) {
s->insert_copy_hgroup.htrees = NULL;
s->distance_hgroup.codes = NULL;
s->distance_hgroup.htrees = NULL;

/* If needed save the start of a first in metablock block */
if (s->save_info_for_recompression) {
s->literals_block_splits.types[s->literals_block_splits.num_blocks] = s->literals_block_splits.num_types_prev_metablocks;
s->literals_block_splits.positions_begin[s->literals_block_splits.num_blocks] = s->pos + (s->rb_roundtrips << s->window_bits);
s->saved_position_literals_begin = BROTLI_TRUE;
s->literals_block_splits.num_types =
BROTLI_MAX(size_t, s->literals_block_splits.num_types,
s->literals_block_splits.types[s->literals_block_splits.num_blocks] + 1);

s->insert_copy_length_block_splits.types[s->insert_copy_length_block_splits.num_blocks] = s->insert_copy_length_block_splits.num_types_prev_metablocks;
s->insert_copy_length_block_splits.positions_begin[s->insert_copy_length_block_splits.num_blocks] = s->pos + (s->rb_roundtrips << s->window_bits);
s->saved_position_lengths_begin = BROTLI_TRUE;
s->insert_copy_length_block_splits.num_types =
BROTLI_MAX(size_t, s->insert_copy_length_block_splits.num_types,
s->insert_copy_length_block_splits.types[s->insert_copy_length_block_splits.num_blocks] + 1);
}
}

void BrotliDecoderStateCleanupAfterMetablock(BrotliDecoderState* s) {
Expand All @@ -124,6 +167,23 @@ void BrotliDecoderStateCleanupAfterMetablock(BrotliDecoderState* s) {
BROTLI_DECODER_FREE(s, s->literal_hgroup.htrees);
BROTLI_DECODER_FREE(s, s->insert_copy_hgroup.htrees);
BROTLI_DECODER_FREE(s, s->distance_hgroup.htrees);

/* If needed save the end of a last in metablock block */
if (s->save_info_for_recompression) {
/* Save the end only if previously saved a start */
if (s->saved_position_literals_begin) {
s->literals_block_splits.positions_end[s->literals_block_splits.num_blocks] = s->pos + (s->rb_roundtrips << s->window_bits);
s->literals_block_splits.num_blocks++;
s->literals_block_splits.num_types_prev_metablocks = s->literals_block_splits.num_types;
s->saved_position_literals_begin = BROTLI_FALSE;
}
if (s->saved_position_lengths_begin) {
s->insert_copy_length_block_splits.positions_end[s->insert_copy_length_block_splits.num_blocks] = s->pos + (s->rb_roundtrips << s->window_bits);
s->insert_copy_length_block_splits.num_blocks++;
s->insert_copy_length_block_splits.num_types_prev_metablocks = s->insert_copy_length_block_splits.num_types;
s->saved_position_lengths_begin = BROTLI_FALSE;
}
}
}

void BrotliDecoderStateCleanup(BrotliDecoderState* s) {
Expand Down
11 changes: 11 additions & 0 deletions c/dec/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <brotli/types.h>
#include "./bit_reader.h"
#include "./huffman.h"
#include "../include/brotli/decode.h"

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
Expand Down Expand Up @@ -233,6 +234,7 @@ typedef struct BrotliMetablockBodyArena {
uint32_t dist_offset[544];
} BrotliMetablockBodyArena;


struct BrotliDecoderStateStruct {
BrotliRunningState state;

Expand All @@ -245,6 +247,14 @@ struct BrotliDecoderStateStruct {
brotli_free_func free_func;
void* memory_manager_opaque;

BackwardReferenceFromDecoder* commands;
size_t commands_size;
size_t commands_alloc_size;

BlockSplitFromDecoder literals_block_splits;
BROTLI_BOOL saved_position_literals_begin;
BlockSplitFromDecoder insert_copy_length_block_splits;
BROTLI_BOOL saved_position_lengths_begin;
/* Temporary storage for remaining input. Brotli stream format is designed in
a way, that 64 bits are enough to make progress in decoding. */
union {
Expand Down Expand Up @@ -318,6 +328,7 @@ struct BrotliDecoderStateStruct {
unsigned int should_wrap_ringbuffer : 1;
unsigned int canny_ringbuffer_allocation : 1;
unsigned int large_window : 1;
unsigned int save_info_for_recompression : 1;
unsigned int size_nibbles : 8;
uint32_t window_bits;

Expand Down
Loading