diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index a1333ccdc5d43..2358b08c82424 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -20,26 +20,41 @@ #include "arrow/result.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace gandiva { -static const size_t DEFAULT_CACHE_SIZE = 5000; - -int GetCapacity() { - size_t capacity = DEFAULT_CACHE_SIZE; - auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); - if (maybe_env_cache_size.ok()) { - const auto env_cache_size = *std::move(maybe_env_cache_size); - if (!env_cache_size.empty()) { - capacity = std::atol(env_cache_size.c_str()); - if (capacity <= 0) { - ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " - << "Using default cache size: " << DEFAULT_CACHE_SIZE; - capacity = DEFAULT_CACHE_SIZE; - } - } +constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE"; +constexpr auto kDefaultCacheSize = 5000; + +namespace internal { +int GetCacheCapacityFromEnvVar() { + auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar); + if (!maybe_env_value.ok()) { + return kDefaultCacheSize; + } + const auto env_value = *std::move(maybe_env_value); + if (env_value.empty()) { + return kDefaultCacheSize; + } + int capacity = 0; + bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>( + env_value.c_str(), env_value.size(), &capacity); + if (!ok || capacity <= 0) { + ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar + << ". Using default cache size: " << kDefaultCacheSize; + return kDefaultCacheSize; } - return static_cast(capacity); + return capacity; +} +} // namespace internal + +// Deprecated in 17.0.0. Use GetCacheCapacity instead. +int GetCapacity() { return GetCacheCapacity(); } + +int GetCacheCapacity() { + static const int capacity = internal::GetCacheCapacityFromEnvVar(); + return capacity; } void LogCacheSize(size_t capacity) { diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index 7cff9b02692ae..c19dbb7a0e30e 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -20,14 +20,27 @@ #include #include +#include "arrow/util/macros.h" #include "gandiva/lru_cache.h" #include "gandiva/visibility.h" namespace gandiva { +namespace internal { +// Only called once by GetCacheCapacity(). +// Do the actual work of getting the cache capacity from env var. +// Also makes the testing easier. +GANDIVA_EXPORT +int GetCacheCapacityFromEnvVar(); +} // namespace internal + +ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.") GANDIVA_EXPORT int GetCapacity(); +GANDIVA_EXPORT +int GetCacheCapacity(); + GANDIVA_EXPORT void LogCacheSize(size_t capacity); @@ -36,7 +49,7 @@ class Cache { public: explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); } - Cache() : Cache(GetCapacity()) {} + Cache() : Cache(GetCacheCapacity()) {} ValueType GetObjectCode(const KeyType& cache_key) { std::optional result; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index a146707079fa6..96cf4a12e587a 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -16,10 +16,14 @@ // under the License. #include "gandiva/cache.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" #include namespace gandiva { + class TestCacheKey { public: explicit TestCacheKey(int value) : value_(value) {} @@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) { ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); } -TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +namespace { +constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE"; +constexpr auto default_cache_capacity = 5000; +} // namespace + +TEST(TestCache, TestGetCacheCapacityDefault) { + ASSERT_EQ(GetCacheCapacity(), default_cache_capacity); +} + +TEST(TestCache, TestGetCacheCapacityEnvVar) { + using ::arrow::EnvVarGuard; + + // Empty. + { + EnvVarGuard guard(cache_capacity_env_var, ""); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Non-number. + { + EnvVarGuard guard(cache_capacity_env_var, "invalid"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Number with invalid suffix. + { + EnvVarGuard guard(cache_capacity_env_var, "42MB"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Valid positive number. + { + EnvVarGuard guard(cache_capacity_env_var, "42"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42); + } + + // Int max. + { + auto str = std::to_string(std::numeric_limits::max()); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits::max()); + } + + // Zero. + { + EnvVarGuard guard(cache_capacity_env_var, "0"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Negative number. + { + EnvVarGuard guard(cache_capacity_env_var, "-1"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Over int max. + { + auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } +} + } // namespace gandiva diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 116c151824c75..0a082b0a5d859 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -181,6 +181,10 @@ that changing their value later will have an effect. The number of entries to keep in the Gandiva JIT compilation cache. The cache is in-memory and does not persist across processes. + The default cache size is 5000. The value of this environment variable + should be a positive integer and should not exceed the maximum value + of int32. Otherwise the default value is used. + .. envvar:: HADOOP_HOME The path to the Hadoop installation.