From e6ab174e20137d62b33e4373f5fbd3c435948036 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Tue, 14 May 2024 20:56:49 +0800 Subject: [PATCH] GH-41329: [C++][Gandiva] Fix gandiva cache size env var (#41330) ### Rationale for this change Gandiva cache size validity checks are not robust enough (the negativity test is broken), and they are not currently tested. ### What changes are included in this PR? 1. Fix checking gandiva cache size env var. 2. Make cache size static so it only gets evaluated once. 3. Add test cases. 4. Enrich the description in the document about this env var. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. * GitHub Issue: #41329 Lead-authored-by: Ruoxi Sun Co-authored-by: Rossi Sun Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/gandiva/cache.cc | 47 +++++++++++++++--------- cpp/src/gandiva/cache.h | 15 +++++++- cpp/src/gandiva/cache_test.cc | 68 ++++++++++++++++++++++++++++++++++- docs/source/cpp/env_vars.rst | 4 +++ 4 files changed, 116 insertions(+), 18 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index a1333ccdc5d43..2358b08c82424 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -20,26 +20,41 @@ #include "arrow/result.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace gandiva { -static const size_t DEFAULT_CACHE_SIZE = 5000; - -int GetCapacity() { - size_t capacity = DEFAULT_CACHE_SIZE; - auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); - if (maybe_env_cache_size.ok()) { - const auto env_cache_size = *std::move(maybe_env_cache_size); - if (!env_cache_size.empty()) { - capacity = std::atol(env_cache_size.c_str()); - if (capacity <= 0) { - ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " - << "Using default cache size: " << DEFAULT_CACHE_SIZE; - capacity = DEFAULT_CACHE_SIZE; - } - } +constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE"; +constexpr auto kDefaultCacheSize = 5000; + +namespace internal { +int GetCacheCapacityFromEnvVar() { + auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar); + if (!maybe_env_value.ok()) { + return kDefaultCacheSize; + } + const auto env_value = *std::move(maybe_env_value); + if (env_value.empty()) { + return kDefaultCacheSize; + } + int capacity = 0; + bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>( + env_value.c_str(), env_value.size(), &capacity); + if (!ok || capacity <= 0) { + ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar + << ". Using default cache size: " << kDefaultCacheSize; + return kDefaultCacheSize; } - return static_cast(capacity); + return capacity; +} +} // namespace internal + +// Deprecated in 17.0.0. Use GetCacheCapacity instead. +int GetCapacity() { return GetCacheCapacity(); } + +int GetCacheCapacity() { + static const int capacity = internal::GetCacheCapacityFromEnvVar(); + return capacity; } void LogCacheSize(size_t capacity) { diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index 7cff9b02692ae..c19dbb7a0e30e 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -20,14 +20,27 @@ #include #include +#include "arrow/util/macros.h" #include "gandiva/lru_cache.h" #include "gandiva/visibility.h" namespace gandiva { +namespace internal { +// Only called once by GetCacheCapacity(). +// Do the actual work of getting the cache capacity from env var. +// Also makes the testing easier. +GANDIVA_EXPORT +int GetCacheCapacityFromEnvVar(); +} // namespace internal + +ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.") GANDIVA_EXPORT int GetCapacity(); +GANDIVA_EXPORT +int GetCacheCapacity(); + GANDIVA_EXPORT void LogCacheSize(size_t capacity); @@ -36,7 +49,7 @@ class Cache { public: explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); } - Cache() : Cache(GetCapacity()) {} + Cache() : Cache(GetCacheCapacity()) {} ValueType GetObjectCode(const KeyType& cache_key) { std::optional result; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index a146707079fa6..96cf4a12e587a 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -16,10 +16,14 @@ // under the License. #include "gandiva/cache.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" #include namespace gandiva { + class TestCacheKey { public: explicit TestCacheKey(int value) : value_(value) {} @@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) { ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); } -TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +namespace { +constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE"; +constexpr auto default_cache_capacity = 5000; +} // namespace + +TEST(TestCache, TestGetCacheCapacityDefault) { + ASSERT_EQ(GetCacheCapacity(), default_cache_capacity); +} + +TEST(TestCache, TestGetCacheCapacityEnvVar) { + using ::arrow::EnvVarGuard; + + // Empty. + { + EnvVarGuard guard(cache_capacity_env_var, ""); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Non-number. + { + EnvVarGuard guard(cache_capacity_env_var, "invalid"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Number with invalid suffix. + { + EnvVarGuard guard(cache_capacity_env_var, "42MB"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Valid positive number. + { + EnvVarGuard guard(cache_capacity_env_var, "42"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42); + } + + // Int max. + { + auto str = std::to_string(std::numeric_limits::max()); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits::max()); + } + + // Zero. + { + EnvVarGuard guard(cache_capacity_env_var, "0"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Negative number. + { + EnvVarGuard guard(cache_capacity_env_var, "-1"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Over int max. + { + auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } +} + } // namespace gandiva diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 116c151824c75..0a082b0a5d859 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -181,6 +181,10 @@ that changing their value later will have an effect. The number of entries to keep in the Gandiva JIT compilation cache. The cache is in-memory and does not persist across processes. + The default cache size is 5000. The value of this environment variable + should be a positive integer and should not exceed the maximum value + of int32. Otherwise the default value is used. + .. envvar:: HADOOP_HOME The path to the Hadoop installation.