Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-37378: [C++] Add A Dictionary Compaction Function For DictionaryArray #37418

Merged
merged 61 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from 56 commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
1dadf59
compaction init
R-JunmingChen Aug 28, 2023
cf474f5
fix bug
R-JunmingChen Aug 28, 2023
aec9ca2
add finsih draft
R-JunmingChen Aug 31, 2023
e522841
lint
R-JunmingChen Aug 31, 2023
9853df3
compaction draft done
R-JunmingChen Aug 31, 2023
23eed73
lint
R-JunmingChen Sep 1, 2023
d1af639
optimized
R-JunmingChen Sep 1, 2023
83a38e1
Update vector_hash.cc
R-JunmingChen Sep 1, 2023
dafd8c8
Update vector_hash.cc
R-JunmingChen Sep 1, 2023
cf29aae
Update vector_hash.cc
R-JunmingChen Sep 1, 2023
3fa7abc
Update vector_hash.cc
R-JunmingChen Sep 1, 2023
74e4fb1
lint
R-JunmingChen Sep 1, 2023
ef1f335
lint
R-JunmingChen Sep 1, 2023
354b5b0
add a basic test
R-JunmingChen Sep 4, 2023
38a7bcc
empty situation
R-JunmingChen Sep 4, 2023
93f4029
full boolean test
R-JunmingChen Sep 4, 2023
1fe45d0
optimize
R-JunmingChen Sep 5, 2023
46959c2
registry
R-JunmingChen Sep 6, 2023
a3dbf0f
move to vector_dictionary
R-JunmingChen Sep 7, 2023
c6fd4f2
rename
R-JunmingChen Sep 7, 2023
73cc864
refine doc
R-JunmingChen Sep 8, 2023
092a38c
lint
R-JunmingChen Sep 8, 2023
dc6244b
lint2
R-JunmingChen Sep 8, 2023
529246d
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 9, 2023
a826666
add a slice test
R-JunmingChen Sep 9, 2023
83dc6fd
Merge branch 'ARROW-37378' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Sep 9, 2023
8d2129a
indice
R-JunmingChen Sep 9, 2023
f39c118
update doc
R-JunmingChen Sep 9, 2023
d9f3ebf
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 22, 2023
4ef8457
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 22, 2023
8901bd3
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 22, 2023
456c217
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 22, 2023
69ca6a0
fix comments
R-JunmingChen Sep 22, 2023
b59fe45
Merge branch 'ARROW-37378' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Sep 22, 2023
ffda004
rename
R-JunmingChen Sep 22, 2023
0ed463a
remove redundant vector
R-JunmingChen Sep 22, 2023
ea88395
check indices
R-JunmingChen Sep 26, 2023
0e84226
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 26, 2023
126761e
Update cpp/src/arrow/compute/kernels/vector_dictionary.cc
R-JunmingChen Sep 26, 2023
a8c9642
index
R-JunmingChen Sep 26, 2023
f6da942
Merge branch 'ARROW-37378' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Sep 26, 2023
065cee4
exec
R-JunmingChen Sep 26, 2023
ce9876b
rename
R-JunmingChen Sep 26, 2023
1384fe9
remove \n
R-JunmingChen Sep 26, 2023
ea3c416
VisitArraySpanInline
R-JunmingChen Sep 26, 2023
e236e96
14
R-JunmingChen Sep 26, 2023
4be9f7b
return
R-JunmingChen Sep 26, 2023
68137d4
fix bug
R-JunmingChen Sep 26, 2023
731b583
dict_array_compact
R-JunmingChen Oct 5, 2023
443ce45
pass basic test
R-JunmingChen Oct 5, 2023
90f27f1
lint
R-JunmingChen Oct 5, 2023
c8a0480
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Oct 10, 2023
c48e57e
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Oct 10, 2023
16854e2
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Oct 10, 2023
d74f17d
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Oct 10, 2023
a7d5c60
Vistor
R-JunmingChen Oct 10, 2023
168a891
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Oct 11, 2023
0cffac0
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Oct 11, 2023
c1a4635
unsafe append
R-JunmingChen Oct 11, 2023
1665144
Delete function
R-JunmingChen Oct 11, 2023
4562d0f
out of bound
R-JunmingChen Oct 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,8 @@ if(ARROW_COMPUTE)
compute/kernels/vector_replace.cc
compute/kernels/vector_run_end_encode.cc
compute/kernels/vector_select_k.cc
compute/kernels/vector_sort.cc)
compute/kernels/vector_sort.cc
compute/kernels/vector_dictionary.cc)

append_runtime_avx2_src(compute/kernels/aggregate_basic_avx2.cc)
append_runtime_avx512_src(compute/kernels/aggregate_basic_avx512.cc)
Expand Down
113 changes: 113 additions & 0 deletions cpp/src/arrow/array/array_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "arrow/array/util.h"
#include "arrow/buffer.h"
#include "arrow/chunked_array.h"
#include "arrow/compute/api.h"
#include "arrow/datum.h"
#include "arrow/status.h"
#include "arrow/table.h"
Expand Down Expand Up @@ -211,6 +212,105 @@ Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
return out_data;
}

struct CompactTransposeMapVistor {
const std::shared_ptr<ArrayData>& data;
arrow::MemoryPool* pool;
std::unique_ptr<Buffer> output_map;
std::shared_ptr<Array> out_compact_dictionary;

template <typename IndexArrowType>
Status CompactTransposeMapImpl() {
int64_t index_length = data->length;
int64_t dict_length = data->dictionary->length;
if (dict_length == 0) {
output_map = nullptr;
out_compact_dictionary = nullptr;
return Status::OK();
} else if (index_length == 0) {
ARROW_ASSIGN_OR_RAISE(out_compact_dictionary,
MakeEmptyArray(data->dictionary->type, pool));
ARROW_ASSIGN_OR_RAISE(output_map, AllocateBuffer(0, pool))
return Status::OK();
}

using CType = typename IndexArrowType::c_type;
const CType* indices_data = data->GetValues<CType>(1);
std::vector<bool> dict_used(dict_length, false);
CType dict_len = static_cast<CType>(dict_length);
int64_t dict_used_count = 0;
for (int64_t i = 0; i < index_length; i++) {
if (data->IsNull(i)) {
continue;
}

CType current_index = indices_data[i];
if (current_index < 0 || current_index >= dict_len) {
return Status::IndexError(
"Index out of bounds while compacting dictionary array: ", current_index,
"(dictionary is ", dict_length, " long) at position ", i);
} else if (!dict_used[current_index]) {
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
dict_used[current_index] = true;
dict_used_count++;

if (dict_used_count == dict_length) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking here enables skipping the rest of the dictionary, which is good. However I think it'd also be useful to detect usage of only a slice of the dictionary. If you'd prefer not to handle that in this PR, please write a follow up issue

Copy link
Contributor Author

@R-JunmingChen R-JunmingChen Oct 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @bkietz, do you mean if we find it just use only a slice of dictionay, we use slice() instead of Take? I prefer to leave it as an new issue. Since we have another PR which is wating for this PR to be merged.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just use Slice outside the Compact to handling this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I write a follow up issue #38247 to handle the optimization

// The dictionary is already compact, so just return here
output_map = nullptr;
out_compact_dictionary = nullptr;
return Status::OK();
}
}
}

using BuilderType = NumericBuilder<IndexArrowType>;
using arrow::compute::Take;
using arrow::compute::TakeOptions;
BuilderType dict_indices_builder(pool);
ARROW_ASSIGN_OR_RAISE(output_map,
AllocateBuffer(dict_length * sizeof(int32_t), pool));
int32_t* output_map_raw = reinterpret_cast<int32_t*>(output_map->mutable_data());
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
int32_t current_index = 0;
for (CType i = 0; i < dict_len; i++) {
if (dict_used[i]) {
ARROW_RETURN_NOT_OK(dict_indices_builder.Append(i));
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
output_map_raw[i] = current_index;
current_index++;
} else {
output_map_raw[i] = -1;
}
}
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> compacted_dict_indices,
dict_indices_builder.Finish());
ARROW_ASSIGN_OR_RAISE(auto compacted_dict_res,
Take(Datum(data->dictionary), compacted_dict_indices,
TakeOptions::NoBoundsCheck()));
out_compact_dictionary = compacted_dict_res.make_array();
return Status::OK();
}

bkietz marked this conversation as resolved.
Show resolved Hide resolved
template <typename Type>
enable_if_integer<Type, Status> Visit(const Type&) {
return CompactTransposeMapImpl<Type>();
}

Status Visit(const DataType& type) {
return Status::TypeError("Expected an Index Type of Int or UInt");
}
};

/// \brief Compute the transpose map and compacted dictionary for `data`
///
/// \param[in] data dictionary-typed array data to analyze
/// \param[in] pool memory pool handed to the visitor for its allocations
/// \param[out] out_compact_dictionary receives the compacted dictionary produced
/// by the visitor (nullptr when the visitor reports nothing to compact)
/// \return the int32 transpose map buffer, or nullptr when the visitor left the
/// map unset; TypeError if `data` is not dictionary-typed
Result<std::unique_ptr<Buffer>> CompactTransposeMap(
    const std::shared_ptr<ArrayData>& data, MemoryPool* pool,
    std::shared_ptr<Array>& out_compact_dictionary) {
  const bool is_dictionary = (data->type->id() == Type::DICTIONARY);
  if (!is_dictionary) {
    return Status::TypeError("Expected dictionary type");
  }

  // Dispatch on the index type; the visitor fills in both outputs.
  CompactTransposeMapVistor visitor{data, pool, nullptr, nullptr};
  const DataType& index_type =
      *checked_cast<const DictionaryType&>(*data->type).index_type();
  RETURN_NOT_OK(VisitTypeInline(index_type, &visitor));

  out_compact_dictionary = visitor.out_compact_dictionary;
  return std::move(visitor.output_map);
}
} // namespace

Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
Expand All @@ -222,6 +322,19 @@ Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
return MakeArray(std::move(transposed));
}

// Return an equivalent DictionaryArray whose dictionary has been compacted by
// CompactTransposeMap, transposing the indices through the computed map.
Result<std::shared_ptr<Array>> DictionaryArray::Compact(MemoryPool* pool) const {
  std::shared_ptr<Array> compacted_values;
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Buffer> index_remap,
                        CompactTransposeMap(this->data_, pool, compacted_values));

  // No map means there is nothing to rewrite: wrap the existing data as-is.
  if (index_remap == nullptr) {
    return std::make_shared<DictionaryArray>(this->data_);
  }

  return this->Transpose(this->type(), compacted_values,
                         index_remap->data_as<int32_t>(), pool);
}

// ----------------------------------------------------------------------
// Dictionary unification

Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/array_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ class ARROW_EXPORT DictionaryArray : public Array {
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;

/// \brief Compact this DictionaryArray
///
/// Only dictionary values that are referenced by the indices are kept; the
/// indices are transposed to point into the compacted dictionary. If there is
/// nothing to remove, an array wrapping the same data is returned.
///
/// \param[in] pool a pool to allocate the array data from
/// \return the compacted array, or an error (e.g. IndexError when an index
/// lies outside the dictionary)
Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;

/// \brief Determine whether dictionary arrays may be compared without unification
bool CanCompareIndices(const DictionaryArray& other) const;

Expand Down
65 changes: 65 additions & 0 deletions cpp/src/arrow/array/array_dict_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1428,6 +1428,71 @@ TEST(TestDictionary, IndicesArray) {
ASSERT_OK(arr->indices()->ValidateFull());
}

void CheckDictionaryCompact(const std::shared_ptr<DataType>& dict_type,
const std::string& input_dictionary_json,
const std::string& input_index_json,
const std::string& expected_dictionary_json,
const std::string& expected_index_json) {
auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json);
const DictionaryArray& input_ref = checked_cast<const DictionaryArray&>(*input);

auto expected =
DictArrayFromJSON(dict_type, expected_index_json, expected_dictionary_json);

ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> actual, input_ref.Compact());
AssertArraysEqual(*expected, *actual, /*verbose=*/true);
}

// Exercises DictionaryArray::Compact for every dictionary index type, with
// boolean and int64 dictionaries.
// NOTE(review): per the PR discussion there is no invalid-index-type case here
// because a DictionaryArray reportedly cannot be created with such a type.
TEST(TestDictionary, Compact) {
  // One row per CheckDictionaryCompact call, in argument order.
  struct CompactCase {
    const char* dictionary;
    const char* indices;
    const char* expected_dictionary;
    const char* expected_indices;
  };

  const CompactCase kBooleanCases[] = {
      // input is compacted
      {"[]", "[]", "[]", "[]"},
      {"[true, false]", "[0, 1, 0]", "[true, false]", "[0, 1, 0]"},
      {"[true, null, false]", "[2, 1, 0]", "[true, null, false]", "[2, 1, 0]"},
      {"[true, false]", "[0, null, 1, 0]", "[true, false]", "[0, null, 1, 0]"},
      {"[true, null, false]", "[2, null, 1, 0]", "[true, null, false]",
       "[2, null, 1, 0]"},
      // input isn't compacted
      {"[null]", "[]", "[]", "[]"},
      {"[false]", "[null]", "[]", "[null]"},
      {"[true, false]", "[0]", "[true]", "[0]"},
      {"[true, false]", "[0, null]", "[true]", "[0, null]"},
      // input isn't compacted && its indices needs to be adjusted
      {"[true, null, false]", "[2, 1]", "[null, false]", "[1, 0]"},
      {"[true, null, false]", "[2, null, 1]", "[null, false]", "[1, null, 0]"},
  };

  const CompactCase kInt64Cases[] = {
      // input isn't compacted && its indices needs to be adjusted
      {"[3, 4, 7, 0, 12, 191, 21, 8]", "[0, 2, 4, 4, 6, 4, 2, 0, 6]",
       "[3, 7, 12, 21]", "[0, 1, 2, 2, 3, 2, 1, 0, 3]"},
      {"[3, 4, 7, 0, 12, 191, 21, 8]", "[4, 6, 7, 7, 6, 4, 6, 6, 6]", "[12, 21, 8]",
       "[0, 1, 2, 2, 1, 0, 1, 1, 1]"},
      {"[3, 4, 7, 0, 12, 191, 21, 8]", "[7, 4, 7, 7, 7, 7, 4, 7, 7]", "[12, 8]",
       "[1, 0, 1, 1, 1, 1, 0, 1, 1]"},
  };

  for (const auto& index_type : all_dictionary_index_types()) {
    ARROW_SCOPED_TRACE("index_type = ", index_type->ToString());

    const std::shared_ptr<arrow::DataType> boolean_dict_type =
        dictionary(index_type, boolean());
    for (const CompactCase& c : kBooleanCases) {
      CheckDictionaryCompact(boolean_dict_type, c.dictionary, c.indices,
                             c.expected_dictionary, c.expected_indices);
    }

    const std::shared_ptr<arrow::DataType> int64_dict_type =
        dictionary(index_type, int64());
    for (const CompactCase& c : kInt64Cases) {
      CheckDictionaryCompact(int64_dict_type, c.dictionary, c.indices,
                             c.expected_dictionary, c.expected_indices);
    }
  }
}

TEST(TestDictionaryUnifier, Numeric) {
auto dict_ty = int64();

Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/compute/api_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,10 @@ Result<Datum> DictionaryEncode(const Datum& value, const DictionaryEncodeOptions
return CallFunction("dictionary_encode", {value}, &options, ctx);
}

// Convenience wrapper invoking the "dictionary_compact" function from the
// registry on a single argument.
Result<Datum> DictionaryCompact(const Datum& value, ExecContext* ctx) {
  const std::vector<Datum> args{value};
  return CallFunction("dictionary_compact", args, ctx);
}

Result<Datum> RunEndEncode(const Datum& value, const RunEndEncodeOptions& options,
ExecContext* ctx) {
return CallFunction("run_end_encode", {value}, &options, ctx);
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,19 @@ Result<Datum> DictionaryEncode(
const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
ExecContext* ctx = NULLPTR);

/// \brief Compact a dictionary array
///
/// Dictionary values that are unused in the input are removed from the
/// dictionary of the output.
///
/// \param[in] value array-like input, which should be of dictionary type
/// \param[in] ctx the function execution context, optional
/// \return the compacted dictionary array
///
/// \since 14.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> DictionaryCompact(const Datum& value, ExecContext* ctx = NULLPTR);

/// \brief Run-end-encode values in an array-like object
///
/// The returned run-end encoded type uses the same value type of the input and
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ add_arrow_compute_test(vector_test
vector_replace_test.cc
vector_run_end_encode_test.cc
select_k_test.cc
vector_dictionary_test.cc
EXTRA_LINK_LIBS
arrow_compute_kernels_testing)

Expand Down
Loading