Skip to content

Commit

Permalink
Merge branch 'main' into parquet/support-read-bf-length
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Nov 29, 2023
2 parents 3a87383 + 1d904d6 commit bf32a8f
Show file tree
Hide file tree
Showing 200 changed files with 4,468 additions and 954 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/comment_bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
python-version: 3.8
- name: Install Archery and Crossbow dependencies
run: pip install -e arrow/dev/archery[bot]
- name: Handle Github comment event
- name: Handle GitHub comment event
env:
ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dev_pr/issue_check.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ async function commentNotStartedTicket(github, context, pullRequestNumber) {
}

/**
* Assigns the Github Issue to the PR creator.
* Assigns the GitHub Issue to the PR creator.
*
* @param {Object} github
* @param {Object} context
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/issue_bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
"per_page": 100,
});
// this removes non-existent labels
// this removes nonexistent labels
component_labels = component_labels.filter(
label => repo_labels.data.some(repo_label => repo_label.name === label)
);
Expand Down
2 changes: 1 addition & 1 deletion ci/scripts/PKGBUILD
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_realname=arrow
pkgbase=mingw-w64-${_realname}
pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}"
pkgver=14.0.0.9000
pkgver=14.0.1.9000
pkgrel=8000
pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)"
arch=("any")
Expand Down
3 changes: 3 additions & 0 deletions ci/scripts/go_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ set -ex

source_dir=${1}/go

# Need "all=" as per https://github.com/golang/go/issues/42131#issuecomment-713917379
export GOFLAGS="${GOFLAGS} -gcflags=all=-d=checkptr"

pushd ${source_dir}/arrow

if [[ -n "${ARROW_GO_TESTCGO}" ]]; then
Expand Down
2 changes: 2 additions & 0 deletions ci/scripts/go_cgo_python_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ set -ex

source_dir=${1}/go

export GOFLAGS="${GOFLAGS} -gcflags=all=-d=checkptr"

pushd ${source_dir}/arrow/cdata/test

case "$(uname)" in
Expand Down
15 changes: 15 additions & 0 deletions cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,21 @@
],
"displayName": "Benchmarking build with with everything enabled",
"cacheVariables": {}
},
{
"name": "fuzzing",
"inherits": "base",
"displayName": "Debug build with IPC and Parquet fuzzing targets",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_C_COMPILER": "clang",
"CMAKE_CXX_COMPILER": "clang++",
"ARROW_USE_ASAN": "ON",
"ARROW_USE_UBSAN": "ON",
"ARROW_IPC": "ON",
"ARROW_PARQUET": "ON",
"ARROW_FUZZING": "ON"
}
}
]
}
3 changes: 3 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5063,6 +5063,9 @@ if(ARROW_S3)
string(APPEND ARROW_PC_REQUIRES_PRIVATE " libcurl")
endif()
string(APPEND ARROW_PC_REQUIRES_PRIVATE " openssl")
if(APPLE)
string(APPEND ARROW_PC_LIBS_PRIVATE " -framework Security")
endif()
endif()
endif()

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ set(ARROW_SRCS
util/debug.cc
util/decimal.cc
util/delimiting.cc
util/dict_util.cc
util/float16.cc
util/formatting.cc
util/future.cc
Expand Down Expand Up @@ -598,7 +599,7 @@ if(ARROW_BUILD_BUNDLED_DEPENDENCIES)
IMPORTED_LOCATION)
install(FILES ${arrow_bundled_dependencies_path} ${INSTALL_IS_OPTIONAL}
DESTINATION ${CMAKE_INSTALL_LIBDIR})
string(APPEND ARROW_PC_LIBS_PRIVATE " -larrow_bundled_dependencies")
string(PREPEND ARROW_PC_LIBS_PRIVATE " -larrow_bundled_dependencies")
list(INSERT ARROW_STATIC_INSTALL_INTERFACE_LIBS 0 "Arrow::arrow_bundled_dependencies")
endif()

Expand Down
57 changes: 57 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,22 @@ class TestArray : public ::testing::Test {
MemoryPool* pool_;
};

// Test helper: builds a dictionary array of type `dict_type` from JSON and
// asserts its null bookkeeping against the expected values.
//
// Note the argument order: this helper takes the dictionary JSON first and the
// index JSON second, whereas DictArrayFromJSON takes the indices first.
//
// The ASSERT_* macros return from this helper on the first mismatch, so later
// checks are skipped once one fails.
void CheckDictionaryNullCount(const std::shared_ptr<DataType>& dict_type,
                              const std::string& input_dictionary_json,
                              const std::string& input_index_json,
                              const int64_t& expected_null_count,
                              const int64_t& expected_logical_null_count,
                              bool expected_may_have_nulls,
                              bool expected_may_have_logical_nulls) {
  const std::shared_ptr<arrow::Array> dict_array = DictArrayFromJSON(
      dict_type, /*indices=*/input_index_json, /*dictionary=*/input_dictionary_json);

  // Counts reported through the Array interface.
  ASSERT_EQ(dict_array->null_count(), expected_null_count);
  ASSERT_EQ(dict_array->ComputeLogicalNullCount(), expected_logical_null_count);

  // Cheap "may have" predicates exposed by the underlying ArrayData.
  const auto& array_data = dict_array->data();
  ASSERT_EQ(array_data->MayHaveNulls(), expected_may_have_nulls);
  ASSERT_EQ(array_data->MayHaveLogicalNulls(), expected_may_have_logical_nulls);
}

TEST_F(TestArray, TestNullCount) {
// These are placeholders
auto data = std::make_shared<Buffer>(nullptr, 0);
Expand Down Expand Up @@ -127,6 +143,37 @@ TEST_F(TestArray, TestNullCount) {
ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());

// Dictionary type
std::shared_ptr<arrow::DataType> type;
std::shared_ptr<arrow::DataType> dict_type;

for (const auto& index_type : all_dictionary_index_types()) {
ARROW_SCOPED_TRACE("index_type = ", index_type->ToString());

type = boolean();
dict_type = dictionary(index_type, type);
// no null value
CheckDictionaryNullCount(dict_type, "[]", "[]", 0, 0, false, false);
CheckDictionaryNullCount(dict_type, "[true, false]", "[0, 1, 0]", 0, 0, false, false);

// only indices contain null value
CheckDictionaryNullCount(dict_type, "[true, false]", "[null, 0, 1]", 1, 1, true,
true);
CheckDictionaryNullCount(dict_type, "[true, false]", "[null, null]", 2, 2, true,
true);

// only dictionary contains null value
CheckDictionaryNullCount(dict_type, "[null, true]", "[]", 0, 0, false, true);
CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0]", 0, 2, false,
true);

// both indices and dictionary contain null value
CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0, null]", 1, 3,
true, true);
CheckDictionaryNullCount(dict_type, "[null, true, null, false]", "[null, 1, 0, 2, 3]",
1, 3, true, true);
}
}

TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Expand All @@ -137,6 +184,16 @@ TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Int32Array arr(/*length=*/100, data, null_bitmap,
/*null_count*/ 100);
EXPECT_EQ(arr.Slice(1, 99)->data()->null_count, arr.Slice(1, 99)->length());

// Dictionary type
std::shared_ptr<arrow::DataType> dict_type = dictionary(int64(), boolean());
std::shared_ptr<arrow::Array> dict_arr =
DictArrayFromJSON(dict_type, /*indices=*/"[null, 0, 0, 0, 0, 0, 1, 2, 0, 0]",
/*dictionary=*/"[null, true, false]");
ASSERT_EQ(dict_arr->null_count(), 1);
ASSERT_EQ(dict_arr->ComputeLogicalNullCount(), 8);
ASSERT_EQ(dict_arr->Slice(2, 8)->null_count(), 0);
ASSERT_EQ(dict_arr->Slice(2, 8)->ComputeLogicalNullCount(), 6);
}

TEST_F(TestArray, TestLength) {
Expand Down
14 changes: 13 additions & 1 deletion cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/dict_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ree_util.h"
Expand Down Expand Up @@ -93,6 +94,10 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

bool DictionaryMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

BufferSpan PackVariadicBuffers(util::span<const std::shared_ptr<Buffer>> buffers) {
return {const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(buffers.data())),
static_cast<int64_t>(buffers.size() * sizeof(std::shared_ptr<Buffer>))};
Expand Down Expand Up @@ -174,7 +179,7 @@ int64_t ArrayData::GetNullCount() const {
}

int64_t ArrayData::ComputeLogicalNullCount() const {
if (this->buffers[0]) {
if (this->buffers[0] && this->type->id() != Type::DICTIONARY) {
return GetNullCount();
}
return ArraySpan(*this).ComputeLogicalNullCount();
Expand Down Expand Up @@ -542,6 +547,9 @@ int64_t ArraySpan::ComputeLogicalNullCount() const {
if (t == Type::RUN_END_ENCODED) {
return ree_util::LogicalNullCount(*this);
}
if (t == Type::DICTIONARY) {
return dict_util::LogicalNullCount(*this);
}
return GetNullCount();
}

Expand Down Expand Up @@ -639,6 +647,10 @@ bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
}

bool ArraySpan::DictionaryMayHaveLogicalNulls() const {
return this->GetNullCount() != 0 || this->dictionary().GetNullCount() != 0;
}

// ----------------------------------------------------------------------
// Implement internal::GetArrayView

Expand Down
14 changes: 11 additions & 3 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,15 @@ struct ArrayData;

namespace internal {
// ----------------------------------------------------------------------
// Null handling for types without a validity bitmap
// Null handling for types without a validity bitmap and the dictionary type

ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);

ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
} // namespace internal

// When slicing, we do not know the null count of the sliced range without
Expand Down Expand Up @@ -280,7 +281,7 @@ struct ARROW_EXPORT ArrayData {

/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
/// nulls
/// nulls, or if the dictionary of a dictionary array may have nulls.
///
/// This is not a drop-in replacement for MayHaveNulls, as historically
/// MayHaveNulls() has been used to check for the presence of a validity
Expand Down Expand Up @@ -325,6 +326,9 @@ struct ARROW_EXPORT ArrayData {
if (t == Type::RUN_END_ENCODED) {
return internal::RunEndEncodedMayHaveLogicalNulls(*this);
}
if (t == Type::DICTIONARY) {
return internal::DictionaryMayHaveLogicalNulls(*this);
}
return null_count.load() != 0;
}

Expand Down Expand Up @@ -505,7 +509,7 @@ struct ARROW_EXPORT ArraySpan {

/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
/// nulls
/// nulls, or if the dictionary of a dictionary array may have nulls.
///
/// \see ArrayData::MayHaveLogicalNulls
bool MayHaveLogicalNulls() const {
Expand All @@ -519,6 +523,9 @@ struct ARROW_EXPORT ArraySpan {
if (t == Type::RUN_END_ENCODED) {
return RunEndEncodedMayHaveLogicalNulls();
}
if (t == Type::DICTIONARY) {
return DictionaryMayHaveLogicalNulls();
}
return null_count != 0;
}

Expand Down Expand Up @@ -560,6 +567,7 @@ struct ARROW_EXPORT ArraySpan {

bool UnionMayHaveLogicalNulls() const;
bool RunEndEncodedMayHaveLogicalNulls() const;
bool DictionaryMayHaveLogicalNulls() const;
};

namespace internal {
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
#if defined(__APPLE__) && defined(PREALLOCATE)
#undef PREALLOCATE
#endif

namespace arrow {
namespace compute {

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/file_json_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class JsonScanMixin {

// Use a reduced number of rows in valgrind to avoid timeouts.
#ifndef ARROW_VALGRIND
constexpr static int64_t kTestMaxNumRows = json::kMaxParserNumRows;
constexpr static int64_t kTestMaxNumRows = (1UL << 17);
#else
constexpr static int64_t kTestMaxNumRows = 1024;
#endif
Expand Down
Loading

0 comments on commit bf32a8f

Please sign in to comment.