diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index b40b9af49356..476c786a63e3 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -177,6 +177,7 @@ jobs: - name: Make Debug Build env: VELOX_DEPENDENCY_SOURCE: BUNDLED + ICU_SOURCE: SYSTEM MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3" EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON" run: | diff --git a/CMake/resolve_dependency_modules/boost.cmake b/CMake/resolve_dependency_modules/boost.cmake index bdf6633d0375..842cba6f1334 100644 --- a/CMake/resolve_dependency_modules/boost.cmake +++ b/CMake/resolve_dependency_modules/boost.cmake @@ -13,30 +13,13 @@ # limitations under the License. include_guard(GLOBAL) -if(CMAKE_SYSTEM_NAME MATCHES "Darwin") - if(ON_APPLE_M1) - list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") - else() - list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") - endif() -endif() - -# ICU is only needed with Boost build from source -set_source(ICU) -resolve_dependency( - ICU - COMPONENTS - data - i18n - io - uc - tu - test) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/boost) -if(${ICU_SOURCE} STREQUAL "BUNDLED") - # ensure ICU is built before Boost - add_dependencies(boost_regex ICU ICU::i18n) + +if(ICU_SOURCE) + if(${ICU_SOURCE} STREQUAL "BUNDLED") + # ensure ICU is built before Boost + add_dependencies(boost_regex ICU ICU::i18n) + endif() endif() # This prevents system boost from leaking in diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ba92170c0f..168eee277044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,6 +385,25 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + if(ON_APPLE_M1) + list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") + else() + list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") + endif() +endif() + +set_source(ICU) +resolve_dependency( + ICU + COMPONENTS + data + i18n + io + uc + tu + test) + 
set(BOOST_INCLUDE_LIBRARIES atomic context diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index 5874e98e9651..31ad1a9ae846 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -297,4 +297,3 @@ function install_apt_deps { fi fi ) - diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch new file mode 100644 index 000000000000..f7b60effe4f3 --- /dev/null +++ b/velox/external/date/patches/0006-add_get_time_zone_names.patch @@ -0,0 +1,30 @@ +diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp +--- a/velox/external/date/tz.cpp ++++ b/velox/external/date/tz.cpp +@@ -3538,6 +3538,14 @@ + return get_tzdb_list().front(); + } + ++std::vector get_time_zone_names() { ++ std::vector result; ++ for (const auto& z : get_tzdb().zones) { ++ result.push_back(z.name()); ++ } ++ return result; ++} ++ + const time_zone* + #if HAS_STRING_VIEW + tzdb::locate_zone(std::string_view tz_name) const +diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h +--- a/velox/external/date/tz.h ++++ b/velox/external/date/tz.h +@@ -1258,6 +1258,8 @@ + + DATE_API const tzdb& get_tzdb(); + ++std::vector get_time_zone_names(); ++ + class tzdb_list + { + std::atomic head_{nullptr}; diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp index 69513d7d3145..13ebe93561da 100644 --- a/velox/external/date/tz.cpp +++ b/velox/external/date/tz.cpp @@ -3538,6 +3538,14 @@ get_tzdb() return get_tzdb_list().front(); } +std::vector get_time_zone_names() { + std::vector result; + for (const auto& z : get_tzdb().zones) { + result.push_back(z.name()); + } + return result; +} + const time_zone* #if HAS_STRING_VIEW tzdb::locate_zone(std::string_view tz_name) const diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h index 4ec0dbb44cfd..aa6d42c8d359 100644 --- a/velox/external/date/tz.h +++ b/velox/external/date/tz.h @@ -1258,6 +1258,8 @@ 
operator<<(std::ostream& os, const tzdb& db); DATE_API const tzdb& get_tzdb(); +std::vector get_time_zone_names(); + class tzdb_list { std::atomic head_{nullptr}; diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt index bdff97bba2d9..5cee3af311d1 100644 --- a/velox/functions/lib/CMakeLists.txt +++ b/velox/functions/lib/CMakeLists.txt @@ -24,7 +24,8 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base) velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp DateTimeFormatterBuilder.cpp) -velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz) +velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz + ICU::i18n ICU::uc) velox_add_library( velox_functions_lib diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index b86d3eb24b8a..32e1ab68f060 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -351,6 +351,133 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) { return -1; } +// Contains a list of all time zone names in a convenient format for searching. +// +// Time zone names without the '/' character (without a prefix) are stored in +// timeZoneNamesWithoutPrefix ordered by size desc. +// +// Time zone names with the '/' character (with a prefix) are stored in a map +// timeZoneNamePrefixMap from prefix (the string before the first '/') to a +// vector of strings which contains the suffixes (the strings after the first +// '/') ordered by size desc. 
+struct TimeZoneNameMappings { + std::vector timeZoneNamesWithoutPrefix; + std::unordered_map> + timeZoneNamePrefixMap; +}; + +TimeZoneNameMappings getTimeZoneNameMappings() { + // Here we use get_time_zone_names instead of calling get_tzdb and + // constructing the list ourselves because there is some unknown issue with + // the tz library where the time_zone objects after the first one in the tzdb + // will be invalid (contain nullptrs) after the get_tzdb function returns. + const std::vector timeZoneNames = date::get_time_zone_names(); + + TimeZoneNameMappings result; + for (size_t i = 0; i < timeZoneNames.size(); i++) { + const auto& timeZoneName = timeZoneNames[i]; + auto separatorPoint = timeZoneName.find('/'); + + if (separatorPoint == std::string::npos) { + result.timeZoneNamesWithoutPrefix.push_back(timeZoneName); + } else { + std::string prefix = timeZoneName.substr(0, separatorPoint); + std::string suffix = timeZoneName.substr(separatorPoint + 1); + + result.timeZoneNamePrefixMap[prefix].push_back(suffix); + } + } + + std::sort( + result.timeZoneNamesWithoutPrefix.begin(), + result.timeZoneNamesWithoutPrefix.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + + for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) { + std::sort( + suffixes.begin(), + suffixes.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + } + + return result; +} + +int64_t parseTimezoneName(const char* cur, const char* end, Date& date) { + // For time zone names we try to greedily find the longest substring starting + // from cur that is a valid time zone name. To help speed things along we + // treat time zone names as {prefix}/{suffix} (for the first instance of '/') + // and create lists of suffixes per prefix. We order these lists by length of + // the suffix so once we identify the prefix, we can return the first suffix + // we find in the string. 
We treat time zone names without a prefix (i.e. + // without a '/') separately but similarly. + static const TimeZoneNameMappings timeZoneNameMappings = + getTimeZoneNameMappings(); + + if (cur < end) { + // Find the first instance of '/' in the remainder of the string + const char* separatorPoint = cur; + while (separatorPoint < end && *separatorPoint != '/') { + ++separatorPoint; + } + + // Try to find a time zone with a prefix that includes the separatorPoint. + if (separatorPoint != end) { + std::string prefix(cur, separatorPoint); + + auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix); + if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) { + // This is greedy, find the longest suffix for the given prefix that + // fits the string. We know the value in the map is already sorted by + // length in decreasing order. + for (const auto& suffixName : it->second) { + if (suffixName.size() <= end - separatorPoint - 1 && + suffixName == + std::string_view(separatorPoint + 1, suffixName.size())) { + auto timeZoneNameSize = prefix.size() + 1 + suffixName.size(); + date.timezone = + tz::locateZone(std::string_view(cur, timeZoneNameSize), false); + + if (!date.timezone) { + return -1; + } + + return timeZoneNameSize; + } + } + } + } + + // If we found a '/' but didn't find a match in the set of time zones with + // prefixes, try searching before the '/' for a time zone without a prefix. If + // we didn't find a '/' then end already equals separatorPoint. + end = separatorPoint; + + for (const auto& timeZoneName : + timeZoneNameMappings.timeZoneNamesWithoutPrefix) { + // Again, this is greedy, find the largest time zone name without a prefix + // that fits the string. We know timeZoneNamesWithoutPrefix is already + // sorted by length in decreasing order. 
+ if (timeZoneName.size() <= end - cur && + timeZoneName == std::string_view(cur, timeZoneName.size())) { + date.timezone = tz::locateZone(timeZoneName, false); + + if (!date.timezone) { + return -1; + } + + return timeZoneName.size(); + } + } + } + + return -1; +} + int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) { // For timezone offset ids, there are three formats allowed by Joda: // @@ -518,7 +645,7 @@ std::string formatFractionOfSecond( return toAdd; } -int32_t appendTimezoneOffset(int64_t offset, char* result) { +int32_t appendTimezoneOffset(int64_t offset, char* result, bool includeColon) { int pos = 0; if (offset >= 0) { result[pos++] = '+'; @@ -536,7 +663,9 @@ int32_t appendTimezoneOffset(int64_t offset, char* result) { result[pos++] = char(hours % 10 + '0'); } - result[pos++] = ':'; + if (includeColon) { + result[pos++] = ':'; + } const auto minutes = (offset / 60) % 60; if LIKELY (minutes == 0) { @@ -687,12 +816,24 @@ int32_t parseFromPattern( bool specifierNext, DateTimeFormatterType type) { if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) { - auto size = parseTimezoneOffset(cur, end, date); + int64_t size; + if (curPattern.minRepresentDigits < 3) { + size = parseTimezoneOffset(cur, end, date); + } else { + size = parseTimezoneName(cur, end, date); + } + if (size == -1) { return -1; } cur += size; } else if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE) { + // JODA does not support parsing time zone long names, so neither do we for + // consistency. The pattern for a time zone long name is 4 or more 'z's. 
+ VELOX_USER_CHECK_LT( + curPattern.minRepresentDigits, + 4, + "Parsing time zone long names is not supported."); auto size = parseTimezone(cur, end, date); if (size == -1) { return -1; @@ -1068,20 +1209,33 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { size += std::max((int)token.pattern.minRepresentDigits, 9); break; case DateTimeFormatSpecifier::TIMEZONE: - if (timezone == nullptr) { - VELOX_USER_FAIL("Timezone unknown"); + if (token.pattern.minRepresentDigits <= 3) { + // The longest abbreviation according to here is 5, e.g. some time + // zones use the offset as the abbreviation, like +0530. + // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + size += 5; + } else { + // The longest time zone long name is 40, Australian Central Western + // Standard Time. + // https://www.timeanddate.com/time/zones/ + size += 50; } - size += std::max( - token.pattern.minRepresentDigits, timezone->name().length()); + break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: - if (token.pattern.minRepresentDigits != 2) { - VELOX_UNSUPPORTED( - "Date format specifier is not supported: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + if (token.pattern.minRepresentDigits == 1) { + // 'Z' means output the time zone offset without a colon. + size += 8; + } else if (token.pattern.minRepresentDigits == 2) { + // 'ZZ' means output the time zone offset with a colon. + size += 9; + } else { + // 'ZZZ' (or more) means output the time zone ID. + if (timezone == nullptr) { + VELOX_USER_FAIL("Timezone unknown"); + } + size += timezone->name().length(); } - size += 9; break; // Not supported. 
case DateTimeFormatSpecifier::WEEK_YEAR: @@ -1310,37 +1464,43 @@ int32_t DateTimeFormatter::format( } break; case DateTimeFormatSpecifier::TIMEZONE: { - // TODO: implement short name time zone, need a map from full name to - // short name + VELOX_USER_CHECK_NOT_NULL( + timezone, + "The time zone cannot be formatted if it is not present."); if (token.pattern.minRepresentDigits <= 3) { - VELOX_UNSUPPORTED("short name time zone is not yet supported"); - } - if (timezone == nullptr) { - VELOX_USER_FAIL("Timezone unknown"); + const std::string& abbrev = timezone->getShortName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, abbrev.data(), abbrev.length()); + result += abbrev.length(); + } else { + std::string longName = timezone->getLongName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, longName.data(), longName.length()); + result += longName.length(); } - const auto& piece = timezone->name(); - std::memcpy(result, piece.data(), piece.length()); - result += piece.length(); } break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: { // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset // with a colon, 'ZZZ' or more outputs the zone id. - // TODO Add support for 'Z' and 'ZZZ'. - if (token.pattern.minRepresentDigits != 2) { - VELOX_UNSUPPORTED( - "format is not supported for specifier {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); - } - if (offset == 0 && zeroOffsetText.has_value()) { std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size()); result += zeroOffsetText->size(); break; } - result += appendTimezoneOffset(offset, result); + if (token.pattern.minRepresentDigits >= 3) { + // Append the time zone ID. 
+ const auto& piece = timezone->name(); + std::memcpy(result, piece.data(), piece.length()); + result += piece.length(); + break; + } + + result += appendTimezoneOffset( + offset, result, token.pattern.minRepresentDigits == 2); break; } case DateTimeFormatSpecifier::WEEK_OF_WEEK_YEAR: { diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 16ae8a77b414..9cce58653252 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -2976,10 +2976,65 @@ TEST_F(DateTimeFunctionsTest, parseDatetime) { ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); EXPECT_EQ( ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); - + // Test a time zone with a prefix. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "2024-02-25+06:00:99 America/Los_Angeles", + "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix is greedy. Etc/GMT-1 and Etc/GMT-10 are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708804800000, "Etc/GMT-10"), + parseDatetime( + "2024-02-25+06:00:99 Etc/GMT-10", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone without a prefix is greedy. NZ and NZ-CHAT are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708791300000, "NZ-CHAT"), + parseDatetime("2024-02-25+06:00:99 NZ-CHAT", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "America/Los_Angeles2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test a time zone without a prefix can handle trailing data. 
+ EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test parsing can fall back to checking for time zones without a prefix when + // a '/' is present but not part of the time zone name. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT/2024-02-25+06:00:99", "ZZZ/yyyy-MM-dd+HH:mm:99")); + + // Test an invalid time zone without a prefix. (zzz should be used to match + // abbreviations) VELOX_ASSERT_THROW( parseDatetime("2024-02-25+06:00:99 PST", "yyyy-MM-dd+HH:mm:99 ZZZ"), "Invalid date format: '2024-02-25+06:00:99 PST'"); + // Test an invalid time zone with a prefix that doesn't appear at all. + VELOX_ASSERT_THROW( + parseDatetime("2024-02-25+06:00:99 ABC/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 ABC/XYZ'"); + // Test an invalid time zone with a prefix that does appear. + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 America/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 America/XYZ'"); + + // Test to ensure we do not support parsing time zone long names (to be + // consistent with JODA). 
+ VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 Pacific Standard Time", + "yyyy-MM-dd+HH:mm:99 zzzz"), + "Parsing time zone long names is not supported."); + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 Pacific Standard Time", + "yyyy-MM-dd+HH:mm:99 zzzzzzzzzz"), + "Parsing time zone long names is not supported."); } TEST_F(DateTimeFunctionsTest, formatDateTime) { @@ -3214,12 +3269,78 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "0010", formatDatetime(parseTimestamp("2022-01-01 03:30:30.001"), "SSSS")); - // Time zone test cases - 'z' + // Time zone test cases - 'Z' setQueryTimeZone("Asia/Kolkata"); EXPECT_EQ( - "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + "Asia/Kolkata", + formatDatetime( + parseTimestamp("1970-01-01"), "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ")); + EXPECT_EQ( + "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "ZZZZ")); + EXPECT_EQ( + "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "ZZZ")); EXPECT_EQ("+05:30", formatDatetime(parseTimestamp("1970-01-01"), "ZZ")); + EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-01-01"), "Z")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzzzzzzzzzzzzzzzzzzzz")); + + // Test daylight savings. 
+ setQueryTimeZone("America/Los_Angeles"); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-03-10 01:00"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-03-10 03:00"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-11-03 01:00"), "z")); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-11-03 02:00"), "z")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-03-10 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-03-10 03:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-11-03 02:00"), "zzzz")); + + // Test ambiguous time. + EXPECT_EQ( + "PDT", formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzzz")); + + // Test a long abbreviation. + setQueryTimeZone("Asia/Colombo"); + EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + + // Test a long long name. + setQueryTimeZone("Australia/Eucla"); + EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "Australian Central Western Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. 
EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''")); @@ -3243,12 +3364,12 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "AD 19 1970 4 Thu 1970 1 1 1 AM 8 8 8 8 3 11 5 Asia/Kolkata", formatDatetime( parseTimestamp("1970-01-01 02:33:11.5"), - "G C Y e E y D M d a K h H k m s S zzzz")); + "G C Y e E y D M d a K h H k m s S ZZZ")); EXPECT_EQ( "AD 19 1970 4 asdfghjklzxcvbnmqwertyuiop Thu ' 1970 1 1 1 AM 8 8 8 8 3 11 5 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ Asia/Kolkata", formatDatetime( parseTimestamp("1970-01-01 02:33:11.5"), - "G C Y e 'asdfghjklzxcvbnmqwertyuiop' E '' y D M d a K h H k m s S 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ zzzz")); + "G C Y e 'asdfghjklzxcvbnmqwertyuiop' E '' y D M d a K h H k m s S 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ ZZZ")); disableAdjustTimestampToTimezone(); EXPECT_EQ( @@ -3259,16 +3380,19 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { // User format errors or unsupported errors. EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "x"), VeloxUserError); + EXPECT_THROW( + formatDatetime(parseTimestamp("1970-01-01"), "q"), VeloxUserError); + EXPECT_THROW( + formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); + + // Time zone name patterns aren't supported when there isn't a time zone + // available. 
EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "z"), VeloxUserError); EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "zz"), VeloxUserError); EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "zzz"), VeloxUserError); - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "q"), VeloxUserError); - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); } TEST_F(DateTimeFunctionsTest, formatDateTimeTimezone) { diff --git a/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp b/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp index f1865e55830e..de833dd77f2d 100644 --- a/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp +++ b/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp @@ -110,7 +110,7 @@ void castToString( const auto* timestamps = input.as>(); auto expectedFormatter = - functions::buildJodaDateTimeFormatter("yyyy-MM-dd HH:mm:ss.SSS zzzz"); + functions::buildJodaDateTimeFormatter("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); VELOX_CHECK( !expectedFormatter.hasError(), "Default format should always be valid, error: {}", diff --git a/velox/type/tz/CMakeLists.txt b/velox/type/tz/CMakeLists.txt index b7d96feb1262..614b594000a7 100644 --- a/velox/type/tz/CMakeLists.txt +++ b/velox/type/tz/CMakeLists.txt @@ -23,4 +23,6 @@ velox_link_libraries( velox_external_date Boost::regex fmt::fmt - Folly::folly) + Folly::folly + ICU::i18n + ICU::uc) diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 93dacadba677..b68c88c6be04 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -20,6 +20,16 @@ #include #include #include + +#include +#include +#include + +// The ICU libraries define TRUE/FALSE macros which frequently conflict with +// other libraries that use these as enum/variable names. 
+#undef TRUE +#undef FALSE + #include "velox/common/base/Exceptions.h" #include "velox/common/testutil/TestValue.h" #include "velox/external/date/tz.h" @@ -242,6 +252,21 @@ void validateRangeImpl(time_point timePoint) { } } +template +date::zoned_time getZonedTime( + const date::time_zone* tz, + date::local_time timestamp, + TimeZone::TChoose choose) { + if (choose == TimeZone::TChoose::kFail) { + // By default, throws. + return date::zoned_time{tz, timestamp}; + } + + auto dateChoose = (choose == TimeZone::TChoose::kEarliest) + ? date::choose::earliest + : date::choose::latest; + return date::zoned_time{tz, timestamp, dateChoose}; +} } // namespace void validateRange(time_point timePoint) { @@ -337,17 +362,7 @@ TimeZone::seconds TimeZone::to_sys( return (timePoint - offset_).time_since_epoch(); } - if (choose == TimeZone::TChoose::kFail) { - // By default, throws. - return date::zoned_time{tz_, timePoint}.get_sys_time().time_since_epoch(); - } - - auto dateChoose = (choose == TimeZone::TChoose::kEarliest) - ? date::choose::earliest - : date::choose::latest; - return date::zoned_time{tz_, timePoint, dateChoose} - .get_sys_time() - .time_since_epoch(); + return getZonedTime(tz_, timePoint, choose).get_sys_time().time_since_epoch(); } TimeZone::seconds TimeZone::to_local(TimeZone::seconds timestamp) const { @@ -361,4 +376,61 @@ TimeZone::seconds TimeZone::to_local(TimeZone::seconds timestamp) const { return date::zoned_time{tz_, timePoint}.get_local_time().time_since_epoch(); } +std::string TimeZone::getShortName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + date::local_time timePoint{timestamp}; + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name (no abbreviations). 
+ if (tz_ == nullptr) { + return timeZoneName_; + } + + return getZonedTime(tz_, timePoint, choose).get_info().abbrev; +} + +std::string TimeZone::getLongName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + static const icu::Locale locale("en", "US"); + + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name. + if (tz_ == nullptr) { + return timeZoneName_; + } + + // Special case for UTC. ICU uses "GMT" for some reason which is an + // abbreviation. + if (timeZoneID_ == 0) { + return "Coordinated Universal Time"; + } + + // Get the ICU TimeZone by name + std::unique_ptr tz(icu::TimeZone::createTimeZone( + icu::UnicodeString(timeZoneName_.data(), timeZoneName_.length()))); + VELOX_USER_CHECK_NOT_NULL(tz); + + // According to the documentation this is how to determine if DST applies to + // a given timestamp in a given time zone. + // https://howardhinnant.github.io/date/tz.html#sys_info + date::local_time timePoint{timestamp}; + bool isDst = getZonedTime(tz_, timePoint, choose).get_info().save != + std::chrono::minutes(0); + + // Construct the long name for the time zone. + // Note that ICU does not have DST information for many time zones prior to + // 1970, so it's important to specify it explicitly. 
+ icu::UnicodeString longName; + tz->getDisplayName( + isDst, icu::TimeZone::EDisplayType::LONG, locale, longName); + + // Convert the UnicodeString back to a string and write it out + std::string longNameStr; + longName.toUTF8String(longNameStr); + + return longNameStr; +} } // namespace facebook::velox::tz diff --git a/velox/type/tz/TimeZoneMap.h b/velox/type/tz/TimeZoneMap.h index 9554d7506328..c04cc308657b 100644 --- a/velox/type/tz/TimeZoneMap.h +++ b/velox/type/tz/TimeZoneMap.h @@ -113,6 +113,7 @@ class TimeZone { } using seconds = std::chrono::seconds; + using milliseconds = std::chrono::milliseconds; /// Converts a local time (the time as perceived in the user time zone /// represented by this object) to a system time (the corresponding time in @@ -151,6 +152,22 @@ class TimeZone { return tz_; } + /// Returns the short name (abbreviation) of the time zone for the given + /// timestamp. Note that the timestamp is needed for time zones that support + /// daylight savings time as the short name will change depending on the date + /// (e.g. PST/PDT). + std::string getShortName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + + /// Returns the long name of the time zone for the given timestamp, e.g. + /// Pacific Standard Time. Note that the timestamp is needed for time zones + /// that support daylight savings time as the long name will change depending + /// on the date (e.g. Pacific Standard Time vs Pacific Daylight Time). 
+ std::string getLongName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + private: const date::time_zone* tz_{nullptr}; const std::chrono::minutes offset_{0}; diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index c56d6ef9c3ac..364fd2baf297 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -234,5 +234,54 @@ TEST(TimeZoneMapTest, invalid) { VELOX_ASSERT_THROW(getTimeZoneID("etc/GMT+300"), "Unknown time zone"); } +TEST(TimeZoneMapTest, getShortName) { + auto toShortName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getShortName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("UTC", toShortName("+00:00", 0)); + + // Test offsets that do not map to named time zones. + EXPECT_EQ("+00:01", toShortName("+00:01", 0)); + EXPECT_EQ("-00:01", toShortName("-00:01", 0)); + EXPECT_EQ("+01:00", toShortName("+01:00", 0)); + EXPECT_EQ("-01:01", toShortName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("PDT", toShortName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("PST", toShortName("America/Los_Angeles", ts)); +} + +TEST(TimeZoneMapTest, getLongName) { + auto toLongName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getLongName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("Coordinated Universal Time", toLongName("+00:00", 0)); + + // Test offsets that do not map to named time zones. 
+ EXPECT_EQ("+00:01", toLongName("+00:01", 0)); + EXPECT_EQ("-00:01", toLongName("-00:01", 0)); + EXPECT_EQ("+01:00", toLongName("+01:00", 0)); + EXPECT_EQ("-01:01", toLongName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("Pacific Daylight Time", toLongName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("Pacific Standard Time", toLongName("America/Los_Angeles", ts)); +} } // namespace } // namespace facebook::velox::tz