From c529fedda47a307f5ebb160049a09834dd776363 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Thu, 24 Oct 2024 14:29:55 -0700 Subject: [PATCH] Add support for zzzz (and beyond) in format_datetime (#11330) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11330 This diff adds support for JODA's zzzz (or more) patterns (all equivalent) in Presto's forma_datetime function. This is used to format long time zone names. Long time zone names are not available from the IANA time zone database, so we can't use the tz library to generate these. Fortunately, unicode provides some utilities to generate these. Differential Revision: D64795407 --- CMakeLists.txt | 12 ++++++ velox/functions/lib/CMakeLists.txt | 3 +- velox/functions/lib/DateTimeFormatter.cpp | 41 ++++++++++++++++--- .../prestosql/tests/DateTimeFunctionsTest.cpp | 26 ++++++++++-- 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1dd401d3e1472..055d54ab47dc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -472,6 +472,18 @@ add_compile_definitions(FOLLY_HAVE_INT128_T=1) set_source(folly) resolve_dependency(folly) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + if(ON_APPLE_M1) + list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") + else() + list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") + endif() +endif() + +# ICU is only needed with Boost build from source +set_source(ICU) +resolve_dependency(ICU COMPONENTS i18n) + if(${VELOX_BUILD_TESTING}) # Spark qury runner depends on absl, gRPC. set_source(absl) diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt index bdff97bba2d95..13d6d5259917d 100644 --- a/velox/functions/lib/CMakeLists.txt +++ b/velox/functions/lib/CMakeLists.txt @@ -24,7 +24,8 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base) velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp DateTimeFormatterBuilder.cpp) -velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz) +velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz + ICU::i18n) velox_add_library( velox_functions_lib diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 835274cf7c5ae..92d233b3f946b 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -16,6 +16,10 @@ #include "velox/functions/lib/DateTimeFormatter.h" #include +#include +#include +#include +#include #include #include #include @@ -1173,10 +1177,10 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones size += 5; } else { - VELOX_NYI( - "Date format specifier is not yet implemented: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + // The longest time zone long name is 40, Australian Central Western + // Standard Time. + // https://www.timeanddate.com/time/zones/ + size += 50; } break; @@ -1432,8 +1436,33 @@ int32_t DateTimeFormatter::format( std::memcpy(result, abbrev.data(), abbrev.length()); result += abbrev.length(); } else { - // TODO: implement full name time zone - VELOX_NYI("full time zone name is not yet supported"); + UErrorCode success = U_ZERO_ERROR; + + static const icu::Locale locale("en", "US"); + static const std::unique_ptr format( + icu::TimeZoneFormat::createInstance(locale, success)); + VELOX_USER_CHECK_NOT_NULL(format); + + // Get the ICU TimeZone by name + const std::string& timeZoneName = timezone->name(); + std::unique_ptr tz( + icu::TimeZone::createTimeZone(icu::UnicodeString( + timeZoneName.data(), timeZoneName.length()))); + VELOX_USER_CHECK_NOT_NULL(tz); + + // Format the time zone to get the long name. + icu::UnicodeString longName; + format->format( + UTimeZoneFormatStyle::UTZFMT_STYLE_SPECIFIC_LONG, + *tz, + (double)timestamp.getSeconds() * 1000.0, // ICU expects a double + longName); + + // Convert the UnicodeString back to a string and write it out + std::string longNameStr; + longName.toUTF8String(longNameStr); + std::memcpy(result, longNameStr.data(), longNameStr.length()); + result += longNameStr.length(); } } break; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 661874dda3fbd..530c5fffa4bf3 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3272,21 +3272,39 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz")); EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzzzzzzzzzzzzzzzzzzzz")); // Test daylight savings. setQueryTimeZone("America/Los_Angeles"); EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "z")); EXPECT_EQ("PDT", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); // Test a long abbreviation. setQueryTimeZone("Asia/Colombo"); EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); - setQueryTimeZone("Asia/Kolkata"); - // We don't support more than 3 'z's yet. - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError); + // Test a long long name. + setQueryTimeZone("Australia/Eucla"); + EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "Australian Central Western Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''"));