Skip to content

Commit

Permalink
Merge branch 'main' into patch-2
Browse files Browse the repository at this point in the history
  • Loading branch information
kou authored Dec 18, 2024
2 parents 8ab5f24 + c5d756e commit 2de2a0a
Show file tree
Hide file tree
Showing 55 changed files with 1,757 additions and 501 deletions.
1 change: 1 addition & 0 deletions ci/docker/python-wheel-manylinux.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ RUN --mount=type=secret,id=github_repository_owner \
--x-feature=flight \
--x-feature=gcs \
--x-feature=json \
--x-feature=orc \
--x-feature=parquet \
--x-feature=s3 && \
rm -rf ~/.config/NuGet/
Expand Down
26 changes: 26 additions & 0 deletions ci/docker/ubuntu-24.04-verify-rc.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Architecture prefix of the base image; substituted into FROM below
# (defaults to amd64).
ARG arch=amd64
FROM ${arch}/ubuntu:24.04

# NOTE(review): presumably set so that package installation performed by
# setup-ubuntu.sh never waits for interactive input -- confirm.
ENV DEBIAN_FRONTEND=noninteractive
# Copy the setup script to /, run it, then delete the script and the apt
# caches/lists within the same RUN step.
COPY dev/release/setup-ubuntu.sh /
RUN /setup-ubuntu.sh && \
rm /setup-ubuntu.sh && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
1 change: 0 additions & 1 deletion ci/scripts/python_wheel_macos_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ cmake \
-DCMAKE_INSTALL_PREFIX=${build_dir}/install \
-DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \
-DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \
-DORC_SOURCE=BUNDLED \
-DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \
-DVCPKG_MANIFEST_MODE=OFF \
-DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \
Expand Down
1 change: 0 additions & 1 deletion ci/scripts/python_wheel_manylinux_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ cmake \
-DCMAKE_INSTALL_LIBDIR=lib \
-DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \
-DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \
-DORC_SOURCE=BUNDLED \
-DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \
-DVCPKG_MANIFEST_MODE=OFF \
-DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \
Expand Down
4 changes: 2 additions & 2 deletions cpp/examples/arrow/parquet_read_write.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

arrow::Status ReadFullFile(std::string path_to_file) {
// #include "arrow/io/api.h"
// #include "arrow/parquet/arrow/reader.h"
// #include "parquet/arrow/reader.h"

arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::RandomAccessFile> input;
Expand All @@ -44,7 +44,7 @@ arrow::Status ReadFullFile(std::string path_to_file) {

arrow::Status ReadInBatches(std::string path_to_file) {
// #include "arrow/io/api.h"
// #include "arrow/parquet/arrow/reader.h"
// #include "parquet/arrow/reader.h"

arrow::MemoryPool* pool = arrow::default_memory_pool();

Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/acero/ArrowAceroConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

include(CMakeFindDependencyMacro)
find_dependency(Arrow)
find_dependency(Parquet)

include("${CMAKE_CURRENT_LIST_DIR}/ArrowAceroTargets.cmake")

Expand Down
121 changes: 121 additions & 0 deletions cpp/src/arrow/acero/hash_join_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "arrow/acero/hash_join.h"
#include "arrow/acero/hash_join_node.h"
#include "arrow/acero/options.h"
#include "arrow/acero/swiss_join_internal.h"
#include "arrow/acero/test_util_internal.h"
#include "arrow/acero/util.h"
#include "arrow/api.h"
Expand Down Expand Up @@ -365,6 +366,21 @@ static void BM_HashJoinBasic_ComplexResidualFilter(benchmark::State& st,

HashJoinBasicBenchmarkImpl(st, settings);
}

// Hash join benchmark with a heavy build-side payload: six build payload
// columns (two each of boolean, fixed_size_binary(64) and utf8) against a
// single int32 probe payload column. st.range(0) gives the number of build
// batches; the probe side uses the same number of batches.
static void BM_HashJoinBasic_HeavyBuildPayload(benchmark::State& st) {
  const int num_batches = static_cast<int>(st.range(0));

  BenchmarkSettings settings;

  // Batch counts: identical on both sides of the join.
  settings.num_build_batches = num_batches;
  settings.num_probe_batches = num_batches;

  // Data distribution knobs.
  settings.cardinality = 1.0 / 16.0;
  settings.null_percentage = 0.5;
  settings.var_length_min = 64;
  settings.var_length_max = 128;

  // Payload layout: light probe side, heavy build side.
  settings.probe_payload_types = {int32()};
  settings.build_payload_types = {boolean(), fixed_size_binary(64), utf8(),
                                  boolean(), fixed_size_binary(64), utf8()};

  HashJoinBasicBenchmarkImpl(st, settings);
}
#endif

std::vector<int64_t> hashtable_krows = benchmark::CreateRange(1, 4096, 8);
Expand Down Expand Up @@ -622,6 +638,10 @@ BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Full Outer",
JoinType::FULL_OUTER)
->ArgNames(complex_residual_filter_argnames)
->ArgsProduct(complex_residual_filter_args);

// Register the heavy-build-payload join benchmark. Its single argument,
// labelled "HashTable krows", takes the values produced by
// benchmark::CreateRange(1, 512, 8).
BENCHMARK(BM_HashJoinBasic_HeavyBuildPayload)
->ArgNames({"HashTable krows"})
->ArgsProduct({benchmark::CreateRange(1, 512, 8)});
#else

BENCHMARK_CAPTURE(BM_HashJoinBasic_KeyTypes, "{int32}", {int32()})
Expand All @@ -640,5 +660,106 @@ BENCHMARK(BM_HashJoinBasic_ProbeParallelism)

#endif // ARROW_BUILD_DETAILED_BENCHMARKS

// Shared driver for the RowArray decode benchmarks.
//
// Generates one random batch of std::numeric_limits<uint16_t>::max() rows for
// `schema`, appends all of its rows to a RowArray, and then, on every
// benchmark iteration, decodes column `column_to_decode` for a fixed
// pseudo-random selection of row ids into a freshly initialized
// ResizableArrayData. The number of processed items reported is
// iterations * rows in the batch.
void RowArrayDecodeBenchmark(benchmark::State& st, const std::shared_ptr<Schema>& schema,
                             int column_to_decode) {
  auto random_batches =
      MakeRandomBatches(schema, 1, std::numeric_limits<uint16_t>::max());
  const auto& batch = random_batches.batches[0];
  const int num_rows = static_cast<int>(batch.length);

  // Encode phase: append rows 0..length-1 of the batch, in order.
  RowArray row_array;
  std::vector<uint16_t> encode_ids(batch.length);
  std::iota(encode_ids.begin(), encode_ids.end(), 0);
  std::vector<KeyColumnArray> scratch_columns;
  DCHECK_OK(row_array.AppendBatchSelection(
      default_memory_pool(), internal::CpuInfo::GetInstance()->hardware_flags(), batch, 0,
      num_rows, num_rows, encode_ids.data(), scratch_columns));

  // Decode phase setup: a random access pattern (fixed seed, so it is
  // reproducible) to simulate what a hash join does.
  std::vector<uint32_t> decode_ids(batch.length);
  std::default_random_engine rng(42);
  std::uniform_int_distribution<uint32_t> pick_row(
      0, static_cast<uint32_t>(batch.length - 1));
  for (uint32_t& row_id : decode_ids) {
    row_id = pick_row(rng);
  }

  for (auto _ : st) {
    ResizableArrayData decoded;
    // Reserve room for no fewer than 8 (2^3) rows, which is convenient for
    // SIMD decoding.
    const int log_num_rows_min = std::max(3, bit_util::Log2(batch.length));
    DCHECK_OK(decoded.Init(batch[column_to_decode].type(), default_memory_pool(),
                           log_num_rows_min));
    DCHECK_OK(row_array.DecodeSelected(&decoded, column_to_decode, num_rows,
                                       decode_ids.data(), default_memory_pool()));
  }

  st.SetItemsProcessed(st.iterations() * batch.length);
}

// Benchmarks decoding the only column of a single-column row array whose
// column has the given `type`. The field is created with an empty name.
static void BM_RowArray_Decode(benchmark::State& st,
                               const std::shared_ptr<DataType>& type) {
  auto only_column = field("", type);

  SchemaBuilder builder;
  DCHECK_OK(builder.AddField(only_column));
  const auto single_column_schema = *builder.Finish();

  // Column 0 is the one (and only) column to decode.
  RowArrayDecodeBenchmark(st, single_column_schema, 0);
}

// Register BM_RowArray_Decode once per column type: boolean and the signed
// integer types of 8, 16, 32 and 64 bits.
BENCHMARK_CAPTURE(BM_RowArray_Decode, "boolean", boolean());
BENCHMARK_CAPTURE(BM_RowArray_Decode, "int8", int8());
BENCHMARK_CAPTURE(BM_RowArray_Decode, "int16", int16());
BENCHMARK_CAPTURE(BM_RowArray_Decode, "int32", int32());
BENCHMARK_CAPTURE(BM_RowArray_Decode, "int64", int64());

// Benchmarks decoding a single fixed_size_binary column whose byte width is
// taken from st.range(0).
static void BM_RowArray_DecodeFixedSizeBinary(benchmark::State& st) {
  const auto byte_width = static_cast<int>(st.range(0));
  auto only_column = field("", fixed_size_binary(byte_width));

  SchemaBuilder builder;
  DCHECK_OK(builder.AddField(only_column));
  const auto single_column_schema = *builder.Finish();

  // Column 0 is the one (and only) column to decode.
  RowArrayDecodeBenchmark(st, single_column_schema, 0);
}

// Register the fixed_size_binary decode benchmark for the byte widths
// 3, 5, 6, 7, 9, 16 and 42, passed as the "fixed_size" argument.
BENCHMARK(BM_RowArray_DecodeFixedSizeBinary)
->ArgNames({"fixed_size"})
->ArgsProduct({{3, 5, 6, 7, 9, 16, 42}});

// Benchmarks decoding a single utf8 column. st.range(0) is attached to the
// field as the "max_length" key/value metadata entry.
// NOTE(review): presumably the random batch generator reads "max_length" to
// bound the generated string lengths -- confirm against MakeRandomBatches.
static void BM_RowArray_DecodeBinary(benchmark::State& st) {
  const auto max_length = static_cast<int>(st.range(0));
  const std::unordered_map<std::string, std::string> metadata{
      {"max_length", std::to_string(max_length)}};
  auto only_column = field("", utf8(), key_value_metadata(metadata));

  SchemaBuilder builder;
  DCHECK_OK(builder.AddField(only_column));
  const auto single_column_schema = *builder.Finish();

  // Column 0 is the one (and only) column to decode.
  RowArrayDecodeBenchmark(st, single_column_schema, 0);
}

// Register the utf8 decode benchmark with "max_length" values 32, 64 and 128.
BENCHMARK(BM_RowArray_DecodeBinary)
->ArgNames({"max_length"})
->ArgsProduct({{32, 64, 128}});

// Benchmarks decoding one column of a multi-column row array. The schema has
// one field (with an empty name) per entry of `types`, in the same order, and
// st.range(0) is the index of the column to decode.
static void BM_RowArray_DecodeOneOfColumns(benchmark::State& st,
                                           std::vector<std::shared_ptr<DataType>> types) {
  SchemaBuilder builder;
  for (size_t i = 0; i < types.size(); ++i) {
    DCHECK_OK(builder.AddField(field("", types[i])));
  }
  const auto multi_column_schema = *builder.Finish();

  const auto column_to_decode = static_cast<int>(st.range(0));
  RowArrayDecodeBenchmark(st, multi_column_schema, column_to_decode);
}

// Column types for a row made up only of boolean, int32 and
// fixed_size_binary(64) columns. The benchmark below is registered once per
// column index, 0 through size() - 1, passed as the "column" argument.
const std::vector<std::shared_ptr<DataType>> fixed_length_row_column_types{
boolean(), int32(), fixed_size_binary(64)};
BENCHMARK_CAPTURE(BM_RowArray_DecodeOneOfColumns,
"fixed_length_row:{boolean,int32,fixed_size_binary(64)}",
fixed_length_row_column_types)
->ArgNames({"column"})
->ArgsProduct(
{benchmark::CreateDenseRange(0, fixed_length_row_column_types.size() - 1, 1)});

// Column types for a row that mixes boolean and int32 columns with two utf8
// columns. The benchmark below is registered once per column index,
// 0 through size() - 1, passed as the "column" argument.
const std::vector<std::shared_ptr<DataType>> var_length_row_column_types{
boolean(), int32(), utf8(), utf8()};
BENCHMARK_CAPTURE(BM_RowArray_DecodeOneOfColumns,
"var_length_row:{boolean,int32,utf8,utf8}", var_length_row_column_types)
->ArgNames({"column"})
->ArgsProduct({benchmark::CreateDenseRange(0, var_length_row_column_types.size() - 1,
1)});

} // namespace acero
} // namespace arrow
Loading

0 comments on commit 2de2a0a

Please sign in to comment.