diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index b40b9af49356..476c786a63e3 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -177,6 +177,7 @@ jobs: - name: Make Debug Build env: VELOX_DEPENDENCY_SOURCE: BUNDLED + ICU_SOURCE: SYSTEM MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3" EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON" run: | diff --git a/CMake/resolve_dependency_modules/boost.cmake b/CMake/resolve_dependency_modules/boost.cmake index bdf6633d0375..842cba6f1334 100644 --- a/CMake/resolve_dependency_modules/boost.cmake +++ b/CMake/resolve_dependency_modules/boost.cmake @@ -13,30 +13,13 @@ # limitations under the License. include_guard(GLOBAL) -if(CMAKE_SYSTEM_NAME MATCHES "Darwin") - if(ON_APPLE_M1) - list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") - else() - list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") - endif() -endif() - -# ICU is only needed with Boost build from source -set_source(ICU) -resolve_dependency( - ICU - COMPONENTS - data - i18n - io - uc - tu - test) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/boost) -if(${ICU_SOURCE} STREQUAL "BUNDLED") - # ensure ICU is built before Boost - add_dependencies(boost_regex ICU ICU::i18n) + +if(ICU_SOURCE) + if(${ICU_SOURCE} STREQUAL "BUNDLED") + # ensure ICU is built before Boost + add_dependencies(boost_regex ICU ICU::i18n) + endif() endif() # This prevents system boost from leaking in diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ba92170c0f..168eee277044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,6 +385,25 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + if(ON_APPLE_M1) + list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") + else() + list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") + endif() +endif() + +set_source(ICU) +resolve_dependency( + ICU + COMPONENTS + data + i18n + io + uc + tu + test) + set(BOOST_INCLUDE_LIBRARIES atomic context diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index 5874e98e9651..31ad1a9ae846 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -297,4 +297,3 @@ function install_apt_deps { fi fi ) - diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch new file mode 100644 index 000000000000..f7b60effe4f3 --- /dev/null +++ b/velox/external/date/patches/0006-add_get_time_zone_names.patch @@ -0,0 +1,30 @@ +diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp +--- a/velox/external/date/tz.cpp ++++ b/velox/external/date/tz.cpp +@@ -3538,6 +3538,14 @@ + return get_tzdb_list().front(); + } + ++std::vector get_time_zone_names() { ++ std::vector result; ++ for (const auto& z : get_tzdb().zones) { ++ result.push_back(z.name()); ++ } ++ return result; ++} ++ + const time_zone* + #if HAS_STRING_VIEW + tzdb::locate_zone(std::string_view tz_name) const +diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h +--- a/velox/external/date/tz.h ++++ b/velox/external/date/tz.h +@@ -1258,6 +1258,8 @@ + + DATE_API const tzdb& get_tzdb(); + ++std::vector get_time_zone_names(); ++ + class tzdb_list + { + std::atomic head_{nullptr}; diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp index 69513d7d3145..13ebe93561da 100644 --- a/velox/external/date/tz.cpp +++ b/velox/external/date/tz.cpp @@ -3538,6 +3538,14 @@ get_tzdb() return get_tzdb_list().front(); } +std::vector get_time_zone_names() { + std::vector result; + for (const auto& z : get_tzdb().zones) { + result.push_back(z.name()); + } + return result; +} + const time_zone* #if HAS_STRING_VIEW tzdb::locate_zone(std::string_view tz_name) const diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h index 4ec0dbb44cfd..aa6d42c8d359 100644 --- a/velox/external/date/tz.h +++ b/velox/external/date/tz.h @@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db); DATE_API const tzdb& get_tzdb(); +std::vector get_time_zone_names(); + class tzdb_list { std::atomic head_{nullptr}; diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt index bdff97bba2d9..5cee3af311d1 100644 --- a/velox/functions/lib/CMakeLists.txt +++ b/velox/functions/lib/CMakeLists.txt @@ -24,7 +24,8 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base) velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp DateTimeFormatterBuilder.cpp) -velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz) +velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz + ICU::i18n ICU::uc) velox_add_library( velox_functions_lib diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index b86d3eb24b8a..598146f28812 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -27,6 +27,11 @@ #include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" +namespace facebook::velox::tz { +// Defined in TimeZoneLinks.cpp +extern const std::unordered_map& getTimeZoneLinks(); +} // namespace facebook::velox::tz + namespace facebook::velox::functions { static thread_local std::string timezoneBuffer = "+00:00"; @@ -351,6 +356,133 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) { return -1; } +// Contains a list of all time zone names in a convenient format for searching. +// +// Time zone names without the '/' character (without a prefix) are stored in +// timeZoneNamesWithoutPrefix ordered by size desc. +// +// Time zone names with the '/' character (with a prefix) are stored in a map +// timeZoneNamePrefixMap from prefix (the string before the first '/') to a +// vector of strings which contains the suffixes (the strings after the first +// '/') ordered by size desc. +struct TimeZoneNameMappings { + std::vector timeZoneNamesWithoutPrefix; + std::unordered_map> + timeZoneNamePrefixMap; +}; + +TimeZoneNameMappings getTimeZoneNameMappings() { + // Here we use get_time_zone_names instead of calling get_tzdb and + // constructing the list ourselves because there is some unknown issue with + // the tz library where the time_zone objects after the first one in the tzdb + // will be invalid (contain nullptrs) after the get_tzdb function returns. + const std::vector timeZoneNames = date::get_time_zone_names(); + + TimeZoneNameMappings result; + for (size_t i = 0; i < timeZoneNames.size(); i++) { + const auto& timeZoneName = timeZoneNames[i]; + auto separatorPoint = timeZoneName.find('/'); + + if (separatorPoint == std::string::npos) { + result.timeZoneNamesWithoutPrefix.push_back(timeZoneName); + } else { + std::string prefix = timeZoneName.substr(0, separatorPoint); + std::string suffix = timeZoneName.substr(separatorPoint + 1); + + result.timeZoneNamePrefixMap[prefix].push_back(suffix); + } + } + + std::sort( + result.timeZoneNamesWithoutPrefix.begin(), + result.timeZoneNamesWithoutPrefix.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + + for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) { + std::sort( + suffixes.begin(), + suffixes.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + } + + return result; +} + +int64_t parseTimezoneName(const char* cur, const char* end, Date& date) { + // For time zone names we try to greedily find the longest substring starting + // from cur that is a valid time zone name. To help speed things along we + // treat time zone names as {prefix}/{suffix} (for the first instance of '/') + // and create lists of suffixes per prefix. We order these lists by length of + // the suffix so once we identify the prefix, we can return the first suffix + // we find in the string. We treat time zone names without a prefix (i.e. + // without a '/') separately but similarly. + static const TimeZoneNameMappings timeZoneNameMappings = + getTimeZoneNameMappings(); + + if (cur < end) { + // Find the first instance of '/' in the remainder of the string + const char* separatorPoint = cur; + while (separatorPoint < end && *separatorPoint != '/') { + ++separatorPoint; + } + + // Try to find a time zone with a prefix that includes the speratorPoint. + if (separatorPoint != end) { + std::string prefix(cur, separatorPoint); + + auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix); + if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) { + // This is greedy, find the longest suffix for the given prefix that + // fits the string. We know the value in the map is already sorted by + // length in decreasing order. + for (const auto& suffixName : it->second) { + if (suffixName.size() <= end - separatorPoint - 1 && + suffixName == + std::string_view(separatorPoint + 1, suffixName.size())) { + auto timeZoneNameSize = prefix.size() + 1 + suffixName.size(); + date.timezone = + tz::locateZone(std::string_view(cur, timeZoneNameSize), false); + + if (!date.timezone) { + return -1; + } + + return timeZoneNameSize; + } + } + } + } + + // If we found a '/' but didn't find a match in the set of time zones with + // prefixes, try search before the '/' for a time zone without a prefix. If + // we didn't find a '/' then end already equals separatorPoint. + end = separatorPoint; + + for (const auto& timeZoneName : + timeZoneNameMappings.timeZoneNamesWithoutPrefix) { + // Again, this is greedy, find the largest time zone name without a prefix + // that fits the string. We know timeZoneNamesWithoutPrefix is already + // sorted by length in decreasing order. + if (timeZoneName.size() <= end - cur && + timeZoneName == std::string_view(cur, timeZoneName.size())) { + date.timezone = tz::locateZone(timeZoneName, false); + + if (!date.timezone) { + return -1; + } + + return timeZoneName.size(); + } + } + } + + return -1; +} + int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) { // For timezone offset ids, there are three formats allowed by Joda: // @@ -518,7 +650,7 @@ std::string formatFractionOfSecond( return toAdd; } -int32_t appendTimezoneOffset(int64_t offset, char* result) { +int32_t appendTimezoneOffset(int64_t offset, char* result, bool includeColon) { int pos = 0; if (offset >= 0) { result[pos++] = '+'; @@ -536,7 +668,9 @@ int32_t appendTimezoneOffset(int64_t offset, char* result) { result[pos++] = char(hours % 10 + '0'); } - result[pos++] = ':'; + if (includeColon) { + result[pos++] = ':'; + } const auto minutes = (offset / 60) % 60; if LIKELY (minutes == 0) { @@ -687,12 +821,24 @@ int32_t parseFromPattern( bool specifierNext, DateTimeFormatterType type) { if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) { - auto size = parseTimezoneOffset(cur, end, date); + int64_t size; + if (curPattern.minRepresentDigits < 3) { + size = parseTimezoneOffset(cur, end, date); + } else { + size = parseTimezoneName(cur, end, date); + } + if (size == -1) { return -1; } cur += size; } else if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE) { + // JODA does not support parsing time zone long names, so neither do we for + // consistency. The pattern for a time zone long name is 4 or more 'z's. + VELOX_USER_CHECK_LT( + curPattern.minRepresentDigits, + 4, + "Parsing time zone long names is not supported."); auto size = parseTimezone(cur, end, date); if (size == -1) { return -1; @@ -1068,20 +1214,35 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { size += std::max((int)token.pattern.minRepresentDigits, 9); break; case DateTimeFormatSpecifier::TIMEZONE: - if (timezone == nullptr) { - VELOX_USER_FAIL("Timezone unknown"); + if (token.pattern.minRepresentDigits <= 3) { + // The longest abbreviation according to here is 5, e.g. some time + // zones use the offset as the abbreviation, like +0530. + // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + size += 5; + } else { + // The longest time zone long name is 40, Australian Central Western + // Standard Time. + // https://www.timeanddate.com/time/zones/ + size += 50; } - size += std::max( - token.pattern.minRepresentDigits, timezone->name().length()); + break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: - if (token.pattern.minRepresentDigits != 2) { - VELOX_UNSUPPORTED( - "Date format specifier is not supported: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + if (token.pattern.minRepresentDigits == 1) { + // 'Z' means output the time zone offset without a colon. + size += 8; + } else if (token.pattern.minRepresentDigits == 2) { + // 'ZZ' means output the time zone offset with a colon. + size += 9; + } else { + // 'ZZZ' (or more) means otuput the time zone ID. + if (timezone == nullptr) { + VELOX_USER_FAIL("Timezone unknown"); + } + + // The longest time zone ID is 32, America/Argentina/ComodRivadavia. + size += 32; } - size += 9; break; // Not supported. case DateTimeFormatSpecifier::WEEK_YEAR: @@ -1310,37 +1471,53 @@ int32_t DateTimeFormatter::format( } break; case DateTimeFormatSpecifier::TIMEZONE: { - // TODO: implement short name time zone, need a map from full name to - // short name + VELOX_USER_CHECK_NOT_NULL( + timezone, + "The time zone cannot be formatted if it is not present."); if (token.pattern.minRepresentDigits <= 3) { - VELOX_UNSUPPORTED("short name time zone is not yet supported"); - } - if (timezone == nullptr) { - VELOX_USER_FAIL("Timezone unknown"); + const std::string& abbrev = timezone->getShortName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, abbrev.data(), abbrev.length()); + result += abbrev.length(); + } else { + std::string longName = timezone->getLongName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, longName.data(), longName.length()); + result += longName.length(); } - const auto& piece = timezone->name(); - std::memcpy(result, piece.data(), piece.length()); - result += piece.length(); } break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: { // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset // with a colon, 'ZZZ' or more outputs the zone id. - // TODO Add support for 'Z' and 'ZZZ'. - if (token.pattern.minRepresentDigits != 2) { - VELOX_UNSUPPORTED( - "format is not supported for specifier {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); - } - if (offset == 0 && zeroOffsetText.has_value()) { std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size()); result += zeroOffsetText->size(); break; } - result += appendTimezoneOffset(offset, result); + if (token.pattern.minRepresentDigits >= 3) { + // Append the time zone ID. + const auto& piece = timezone->name(); + + static const auto& timeZoneLinks = tz::getTimeZoneLinks(); + auto timeZoneLinksIter = timeZoneLinks.find(piece); + if (timeZoneLinksIter != timeZoneLinks.end()) { + const auto& timeZoneLink = timeZoneLinksIter->second; + std::memcpy(result, timeZoneLink.data(), timeZoneLink.length()); + result += timeZoneLink.length(); + break; + } + + std::memcpy(result, piece.data(), piece.length()); + result += piece.length(); + break; + } + + result += appendTimezoneOffset( + offset, result, token.pattern.minRepresentDigits == 2); break; } case DateTimeFormatSpecifier::WEEK_OF_WEEK_YEAR: { diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 16ae8a77b414..20265004aafa 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -2976,10 +2976,65 @@ TEST_F(DateTimeFunctionsTest, parseDatetime) { ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); EXPECT_EQ( ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); - + // Test a time zone with a prefix. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "2024-02-25+06:00:99 America/Los_Angeles", + "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix is greedy. Etc/GMT-1 and Etc/GMT-10 are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708804800000, "Etc/GMT-10"), + parseDatetime( + "2024-02-25+06:00:99 Etc/GMT-10", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone without a prefix is greedy. NZ and NZ-CHAT are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708791300000, "NZ-CHAT"), + parseDatetime("2024-02-25+06:00:99 NZ-CHAT", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "America/Los_Angeles2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test a time zone without a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test parsing can fall back to checking for time zones without a prefix when + // a '/' is present but not part of the time zone name. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT/2024-02-25+06:00:99", "ZZZ/yyyy-MM-dd+HH:mm:99")); + + // Test an invalid time zone without a prefix. (zzz should be used to match + // abbreviations) VELOX_ASSERT_THROW( parseDatetime("2024-02-25+06:00:99 PST", "yyyy-MM-dd+HH:mm:99 ZZZ"), "Invalid date format: '2024-02-25+06:00:99 PST'"); + // Test an invalid time zone with a prefix that doesn't appear at all. + VELOX_ASSERT_THROW( + parseDatetime("2024-02-25+06:00:99 ABC/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 ABC/XYZ'"); + // Test an invalid time zone with a prefix that does appear. + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 America/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 America/XYZ'"); + + // Test to ensure we do not support parsing time zone long names (to be + // consistent with JODA). + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 Pacific Standard Time", + "yyyy-MM-dd+HH:mm:99 zzzz"), + "Parsing time zone long names is not supported."); + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 Pacific Standard Time", + "yyyy-MM-dd+HH:mm:99 zzzzzzzzzz"), + "Parsing time zone long names is not supported."); } TEST_F(DateTimeFunctionsTest, formatDateTime) { @@ -3214,12 +3269,89 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "0010", formatDatetime(parseTimestamp("2022-01-01 03:30:30.001"), "SSSS")); - // Time zone test cases - 'z' + // Time zone test cases - 'Z' setQueryTimeZone("Asia/Kolkata"); EXPECT_EQ( - "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + "Asia/Kolkata", + formatDatetime( + parseTimestamp("1970-01-01"), "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ")); + EXPECT_EQ( + "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "ZZZZ")); + EXPECT_EQ( + "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "ZZZ")); EXPECT_EQ("+05:30", formatDatetime(parseTimestamp("1970-01-01"), "ZZ")); + EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-01-01"), "Z")); + + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzzzzzzzzzzzzzzzzzzzz")); + + // Test daylight savings. + setQueryTimeZone("America/Los_Angeles"); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-03-10 01:00"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-03-10 03:00"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-11-03 01:00"), "z")); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-11-03 02:00"), "z")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-03-10 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-03-10 03:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-11-03 02:00"), "zzzz")); + // Test ambiguous time. + EXPECT_EQ( + "PDT", formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzzz")); + + // Test a long abbreviation. + setQueryTimeZone("Asia/Colombo"); + EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + + // Test a long long name. + setQueryTimeZone("Australia/Eucla"); + EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "Australian Central Western Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + + // Test a time zone name that is linked to another (that gets replaced when + // converted to a string). + setQueryTimeZone("US/Pacific"); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "America/Los_Angeles", + formatDatetime(parseTimestamp("1970-01-01"), "ZZZ")); + + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''")); @@ -3243,12 +3375,12 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "AD 19 1970 4 Thu 1970 1 1 1 AM 8 8 8 8 3 11 5 Asia/Kolkata", formatDatetime( parseTimestamp("1970-01-01 02:33:11.5"), - "G C Y e E y D M d a K h H k m s S zzzz")); + "G C Y e E y D M d a K h H k m s S ZZZ")); EXPECT_EQ( "AD 19 1970 4 asdfghjklzxcvbnmqwertyuiop Thu ' 1970 1 1 1 AM 8 8 8 8 3 11 5 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ Asia/Kolkata", formatDatetime( parseTimestamp("1970-01-01 02:33:11.5"), - "G C Y e 'asdfghjklzxcvbnmqwertyuiop' E '' y D M d a K h H k m s S 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ zzzz")); + "G C Y e 'asdfghjklzxcvbnmqwertyuiop' E '' y D M d a K h H k m s S 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ ZZZ")); disableAdjustTimestampToTimezone(); EXPECT_EQ( @@ -3259,16 +3391,19 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { // User format errors or unsupported errors. EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "x"), VeloxUserError); + EXPECT_THROW( + formatDatetime(parseTimestamp("1970-01-01"), "q"), VeloxUserError); + EXPECT_THROW( + formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); + + // Time zone name patterns aren't supported when there isn't a time zone + // available. EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "z"), VeloxUserError); EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "zz"), VeloxUserError); EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "zzz"), VeloxUserError); - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "q"), VeloxUserError); - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); } TEST_F(DateTimeFunctionsTest, formatDateTimeTimezone) { diff --git a/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp b/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp index 6b9990ada924..ddd100c6b46a 100644 --- a/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp +++ b/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp @@ -125,7 +125,8 @@ TEST_F(TimestampWithTimeZoneCastTest, toVarchar) { "1970-01-01 01:11:37.123 America/New_York", "1969-12-31 22:11:37.123 America/Los_Angeles", "1970-01-01 14:11:37.123 Asia/Shanghai", - "1970-01-01 11:41:37.123 Asia/Calcutta", + "1970-01-01 11:41:37.123 Asia/Kolkata", // Asia/Calcutta is linked to + // Asia/Kolkata. }); auto result = evaluate("cast(c0 as varchar)", makeRowVector({input})); diff --git a/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp b/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp index f1865e55830e..de833dd77f2d 100644 --- a/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp +++ b/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp @@ -110,7 +110,7 @@ void castToString( const auto* timestamps = input.as>(); auto expectedFormatter = - functions::buildJodaDateTimeFormatter("yyyy-MM-dd HH:mm:ss.SSS zzzz"); + functions::buildJodaDateTimeFormatter("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); VELOX_CHECK( !expectedFormatter.hasError(), "Default format should always be valid, error: {}", diff --git a/velox/type/tz/CMakeLists.txt b/velox/type/tz/CMakeLists.txt index b7d96feb1262..8f7c2381b823 100644 --- a/velox/type/tz/CMakeLists.txt +++ b/velox/type/tz/CMakeLists.txt @@ -14,8 +14,12 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -velox_add_library(velox_type_tz TimeZoneMap.h TimeZoneDatabase.cpp - TimeZoneMap.cpp) +velox_add_library( + velox_type_tz + TimeZoneMap.h + TimeZoneDatabase.cpp + TimeZoneLinks.cpp + TimeZoneMap.cpp) velox_link_libraries( velox_type_tz @@ -23,4 +27,6 @@ velox_link_libraries( velox_external_date Boost::regex fmt::fmt - Folly::folly) + Folly::folly + ICU::i18n + ICU::uc) diff --git a/velox/type/tz/TimeZoneLinks.cpp b/velox/type/tz/TimeZoneLinks.cpp new file mode 100644 index 000000000000..de1e381ceb91 --- /dev/null +++ b/velox/type/tz/TimeZoneLinks.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is generated. Do not modify it manually. To re-generate it, run: +// +// ./velox/type/tz/gen_timezone_links.py -f /tmp/backward \ +// > velox/type/tz/TimeZoneLinks.cpp +// +// The backward file should be the same one used in JODA, +// The latest is available here : +// https://github.com/JodaOrg/global-tz/blob/global-tz/backward. +// Presto Java currently uses the one found here: +// https://github.com/JodaOrg/global-tz/blob/2024agtz/backward +// To find the current version in Presto Java, check this file at the version of +// JODA Presto Java is using: +// https://github.com/JodaOrg/joda-time/blob/v2.12.7/src/changes/changes.xml +// @generated + +#include +#include + +namespace facebook::velox::tz { + +const std::unordered_map& getTimeZoneLinks() { + static auto* tzLinks = new std::unordered_map([] { + // Work around clang compiler bug causing multi-hour compilation + // with -fsanitize=fuzzer + // https://github.com/llvm/llvm-project/issues/75666 + return std::unordered_map{ + {"Australia/ACT", "Australia/Sydney"}, + {"Australia/LHI", "Australia/Lord_Howe"}, + {"Australia/NSW", "Australia/Sydney"}, + {"Australia/North", "Australia/Darwin"}, + {"Australia/Queensland", "Australia/Brisbane"}, + {"Australia/South", "Australia/Adelaide"}, + {"Australia/Tasmania", "Australia/Hobart"}, + {"Australia/Victoria", "Australia/Melbourne"}, + {"Australia/West", "Australia/Perth"}, + {"Australia/Yancowinna", "Australia/Broken_Hill"}, + {"Brazil/Acre", "America/Rio_Branco"}, + {"Brazil/DeNoronha", "America/Noronha"}, + {"Brazil/East", "America/Sao_Paulo"}, + {"Brazil/West", "America/Manaus"}, + {"Canada/Atlantic", "America/Halifax"}, + {"Canada/Central", "America/Winnipeg"}, + {"Canada/Eastern", "America/Toronto"}, + {"Canada/Mountain", "America/Edmonton"}, + {"Canada/Newfoundland", "America/St_Johns"}, + {"Canada/Pacific", "America/Vancouver"}, + {"Canada/Saskatchewan", "America/Regina"}, + {"Canada/Yukon", "America/Whitehorse"}, + {"Chile/Continental", "America/Santiago"}, + {"Chile/EasterIsland", "Pacific/Easter"}, + {"Cuba", "America/Havana"}, + {"Egypt", "Africa/Cairo"}, + {"Eire", "Europe/Dublin"}, + {"Etc/GMT+0", "Etc/GMT"}, + {"Etc/GMT-0", "Etc/GMT"}, + {"Etc/GMT0", "Etc/GMT"}, + {"Etc/Greenwich", "Etc/GMT"}, + {"Etc/UCT", "Etc/UTC"}, + {"Etc/Universal", "Etc/UTC"}, + {"Etc/Zulu", "Etc/UTC"}, + {"GB", "Europe/London"}, + {"GB-Eire", "Europe/London"}, + {"GMT+0", "Etc/GMT"}, + {"GMT-0", "Etc/GMT"}, + {"GMT0", "Etc/GMT"}, + {"Greenwich", "Etc/GMT"}, + {"Hongkong", "Asia/Hong_Kong"}, + {"Iceland", "Atlantic/Reykjavik"}, + {"Iran", "Asia/Tehran"}, + {"Israel", "Asia/Jerusalem"}, + {"Jamaica", "America/Jamaica"}, + {"Japan", "Asia/Tokyo"}, + {"Kwajalein", "Pacific/Kwajalein"}, + {"Libya", "Africa/Tripoli"}, + {"Mexico/BajaNorte", "America/Tijuana"}, + {"Mexico/BajaSur", "America/Mazatlan"}, + {"Mexico/General", "America/Mexico_City"}, + {"NZ", "Pacific/Auckland"}, + {"NZ-CHAT", "Pacific/Chatham"}, + {"Navajo", "America/Denver"}, + {"PRC", "Asia/Shanghai"}, + {"Poland", "Europe/Warsaw"}, + {"Portugal", "Europe/Lisbon"}, + {"ROC", "Asia/Taipei"}, + {"ROK", "Asia/Seoul"}, + {"Singapore", "Asia/Singapore"}, + {"Turkey", "Europe/Istanbul"}, + {"UCT", "Etc/UTC"}, + {"US/Alaska", "America/Anchorage"}, + {"US/Aleutian", "America/Adak"}, + {"US/Arizona", "America/Phoenix"}, + {"US/Central", "America/Chicago"}, + {"US/East-Indiana", "America/Indiana/Indianapolis"}, + {"US/Eastern", "America/New_York"}, + {"US/Hawaii", "Pacific/Honolulu"}, + {"US/Indiana-Starke", "America/Indiana/Knox"}, + {"US/Michigan", "America/Detroit"}, + {"US/Mountain", "America/Denver"}, + {"US/Pacific", "America/Los_Angeles"}, + {"US/Samoa", "Pacific/Pago_Pago"}, + {"UTC", "Etc/UTC"}, + {"Universal", "Etc/UTC"}, + {"W-SU", "Europe/Moscow"}, + {"Zulu", "Etc/UTC"}, + {"America/Buenos_Aires", "America/Argentina/Buenos_Aires"}, + {"America/Catamarca", "America/Argentina/Catamarca"}, + {"America/Cordoba", "America/Argentina/Cordoba"}, + {"America/Indianapolis", "America/Indiana/Indianapolis"}, + {"America/Jujuy", "America/Argentina/Jujuy"}, + {"America/Knox_IN", "America/Indiana/Knox"}, + {"America/Louisville", "America/Kentucky/Louisville"}, + {"America/Mendoza", "America/Argentina/Mendoza"}, + {"America/Virgin", "America/St_Thomas"}, + {"Pacific/Samoa", "Pacific/Pago_Pago"}, + {"Africa/Timbuktu", "Africa/Bamako"}, + {"America/Argentina/ComodRivadavia", "America/Argentina/Catamarca"}, + {"America/Atka", "America/Adak"}, + {"America/Coral_Harbour", "America/Atikokan"}, + {"America/Ensenada", "America/Tijuana"}, + {"America/Fort_Wayne", "America/Indiana/Indianapolis"}, + {"America/Montreal", "America/Toronto"}, + {"America/Nipigon", "America/Toronto"}, + {"America/Pangnirtung", "America/Iqaluit"}, + {"America/Porto_Acre", "America/Rio_Branco"}, + {"America/Rainy_River", "America/Winnipeg"}, + {"America/Rosario", "America/Argentina/Cordoba"}, + {"America/Santa_Isabel", "America/Tijuana"}, + {"America/Shiprock", "America/Denver"}, + {"America/Thunder_Bay", "America/Toronto"}, + {"America/Yellowknife", "America/Edmonton"}, + {"Antarctica/South_Pole", "Antarctica/McMurdo"}, + {"Asia/Chongqing", "Asia/Shanghai"}, + {"Asia/Harbin", "Asia/Shanghai"}, + {"Asia/Kashgar", "Asia/Urumqi"}, + {"Asia/Tel_Aviv", "Asia/Jerusalem"}, + {"Atlantic/Jan_Mayen", "Europe/Oslo"}, + {"Australia/Canberra", "Australia/Sydney"}, + {"Australia/Currie", "Australia/Hobart"}, + {"Europe/Belfast", "Europe/London"}, + {"Europe/Tiraspol", "Europe/Chisinau"}, + {"Europe/Uzhgorod", "Europe/Kyiv"}, + {"Europe/Zaporozhye", "Europe/Kyiv"}, + {"Pacific/Enderbury", "Pacific/Kanton"}, + {"Pacific/Johnston", "Pacific/Honolulu"}, + {"Pacific/Yap", "Pacific/Chuuk"}, + {"Africa/Asmera", "Africa/Asmara"}, + {"America/Godthab", "America/Nuuk"}, + {"Asia/Ashkhabad", "Asia/Ashgabat"}, + {"Asia/Calcutta", "Asia/Kolkata"}, + {"Asia/Chungking", "Asia/Shanghai"}, + {"Asia/Dacca", "Asia/Dhaka"}, + {"Asia/Istanbul", "Europe/Istanbul"}, + {"Asia/Katmandu", "Asia/Kathmandu"}, + {"Asia/Macao", "Asia/Macau"}, + {"Asia/Rangoon", "Asia/Yangon"}, + {"Asia/Saigon", "Asia/Ho_Chi_Minh"}, + {"Asia/Thimbu", "Asia/Thimphu"}, + {"Asia/Ujung_Pandang", "Asia/Makassar"}, + {"Asia/Ulan_Bator", "Asia/Ulaanbaatar"}, + {"Atlantic/Faeroe", "Atlantic/Faroe"}, + {"Europe/Kiev", "Europe/Kyiv"}, + {"Europe/Nicosia", "Asia/Nicosia"}, + {"Pacific/Ponape", "Pacific/Pohnpei"}, + {"Pacific/Truk", "Pacific/Chuuk"}, + }; + }()); + return *tzLinks; +} + +} // namespace facebook::velox::tz diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 93dacadba677..b68c88c6be04 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -20,6 +20,16 @@ #include #include #include + +#include +#include +#include + +// The ICU libraries define TRUE/FALSE macros which frequently conflict with +// other libraries that use these as enum/variable names. +#undef TRUE +#undef FALSE + #include "velox/common/base/Exceptions.h" #include "velox/common/testutil/TestValue.h" #include "velox/external/date/tz.h" @@ -242,6 +252,21 @@ void validateRangeImpl(time_point timePoint) { } } +template +date::zoned_time getZonedTime( + const date::time_zone* tz, + date::local_time timestamp, + TimeZone::TChoose choose) { + if (choose == TimeZone::TChoose::kFail) { + // By default, throws. + return date::zoned_time{tz, timestamp}; + } + + auto dateChoose = (choose == TimeZone::TChoose::kEarliest) + ? date::choose::earliest + : date::choose::latest; + return date::zoned_time{tz, timestamp, dateChoose}; +} } // namespace void validateRange(time_point timePoint) { @@ -337,17 +362,7 @@ TimeZone::seconds TimeZone::to_sys( return (timePoint - offset_).time_since_epoch(); } - if (choose == TimeZone::TChoose::kFail) { - // By default, throws. - return date::zoned_time{tz_, timePoint}.get_sys_time().time_since_epoch(); - } - - auto dateChoose = (choose == TimeZone::TChoose::kEarliest) - ? date::choose::earliest - : date::choose::latest; - return date::zoned_time{tz_, timePoint, dateChoose} - .get_sys_time() - .time_since_epoch(); + return getZonedTime(tz_, timePoint, choose).get_sys_time().time_since_epoch(); } TimeZone::seconds TimeZone::to_local(TimeZone::seconds timestamp) const { @@ -361,4 +376,61 @@ TimeZone::seconds TimeZone::to_local(TimeZone::seconds timestamp) const { return date::zoned_time{tz_, timePoint}.get_local_time().time_since_epoch(); } +std::string TimeZone::getShortName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + date::local_time timePoint{timestamp}; + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name (no abbreviations). + if (tz_ == nullptr) { + return timeZoneName_; + } + + return getZonedTime(tz_, timePoint, choose).get_info().abbrev; +} + +std::string TimeZone::getLongName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + static const icu::Locale locale("en", "US"); + + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name. + if (tz_ == nullptr) { + return timeZoneName_; + } + + // Special case for UTC. ICU uses "GMT" for some reason which is an + // abbreviation. + if (timeZoneID_ == 0) { + return "Coordinated Universal Time"; + } + + // Get the ICU TimeZone by name + std::unique_ptr tz(icu::TimeZone::createTimeZone( + icu::UnicodeString(timeZoneName_.data(), timeZoneName_.length()))); + VELOX_USER_CHECK_NOT_NULL(tz); + + // According to the documentation this is how to determine if DST applies to + // a given timestamp in a given time zone. + // https://howardhinnant.github.io/date/tz.html#sys_info + date::local_time timePoint{timestamp}; + bool isDst = getZonedTime(tz_, timePoint, choose).get_info().save != + std::chrono::minutes(0); + + // Construct the long name for the time zone. + // Note that ICU does not have DST information for many time zones prior to + // 1970, so it's important to specify it explicitly. + icu::UnicodeString longName; + tz->getDisplayName( + isDst, icu::TimeZone::EDisplayType::LONG, locale, longName); + + // Convert the UnicodeString back to a string and write it out + std::string longNameStr; + longName.toUTF8String(longNameStr); + + return longNameStr; +} } // namespace facebook::velox::tz diff --git a/velox/type/tz/TimeZoneMap.h b/velox/type/tz/TimeZoneMap.h index 9554d7506328..c04cc308657b 100644 --- a/velox/type/tz/TimeZoneMap.h +++ b/velox/type/tz/TimeZoneMap.h @@ -113,6 +113,7 @@ class TimeZone { } using seconds = std::chrono::seconds; + using milliseconds = std::chrono::milliseconds; /// Converts a local time (the time as perceived in the user time zone /// represented by this object) to a system time (the corresponding time in @@ -151,6 +152,22 @@ class TimeZone { return tz_; } + /// Returns the short name (abbreviation) of the time zone for the given + /// timestamp. Note that the timestamp is needed for time zones that support + /// daylight savings time as the short name will change depending on the date + /// (e.g. PST/PDT). + std::string getShortName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + + /// Returns the long name of the time zone for the given timestamp, e.g. + /// Pacific Standard Time. Note that the timestamp is needed for time zones + /// that support daylight savings time as the long name will change depending + /// on the date (e.g. Pacific Standard Time vs Pacific Daylight Time). + std::string getLongName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + private: const date::time_zone* tz_{nullptr}; const std::chrono::minutes offset_{0}; diff --git a/velox/type/tz/gen_timezone_links.py b/velox/type/tz/gen_timezone_links.py new file mode 100755 index 000000000000..6b414b247c01 --- /dev/null +++ b/velox/type/tz/gen_timezone_links.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys +from string import Template + +cpp_template = Template( + """\ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is generated. Do not modify it manually. To re-generate it, run: +// +// ./velox/type/tz/gen_timezone_links.py -f /tmp/backward \\ +// > velox/type/tz/TimeZoneLinks.cpp +// +// The backward file should be the same one used in JODA, +// The latest is available here : +// https://github.com/JodaOrg/global-tz/blob/global-tz/backward. +// Presto Java currently uses the one found here: +// https://github.com/JodaOrg/global-tz/blob/2024agtz/backward +// To find the current version in Presto Java, check this file at the version of +// JODA Presto Java is using: +// https://github.com/JodaOrg/joda-time/blob/v2.12.7/src/changes/changes.xml +// @generated + +#include +#include + +namespace facebook::velox::tz { + +const std::unordered_map& getTimeZoneLinks() { + static auto* tzLinks = new std::unordered_map([] { + // Work around clang compiler bug causing multi-hour compilation + // with -fsanitize=fuzzer + // https://github.com/llvm/llvm-project/issues/75666 + return std::unordered_map{ +$entries + }; + }()); + return *tzLinks; +} + +} // namespace facebook::velox::tz\ +""" +) + +entry_template = Template(' {"$tz_key", "$tz_value"},') + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Reads an input file (specified by -f) containing mappings from " + "time zone names to the names that should replace them and prints the " + "corresponding `TimeZoneLinks.cpp` file.", + epilog="(c) Facebook 2004-present", + ) + parser.add_argument( + "-f", + "--file-input", + required=True, + help="Input timezone links file.", + ) + return parser.parse_args() + + +def main(): + args = parse_arguments() + entries = [] + + # Read an input file with links following the format: + # + # > Link Australia/Sydney Australia/NSW + # > Link Australia/Darwin Australia/North + # > Link Australia/Brisbane Australia/Queensland + # > ... + # + # Possible containing comments (everything in a line after # is ignored) + + with open(args.file_input) as file_in: + for line in file_in: + line = line.strip() + + columns = line.split() + if len(columns) == 0: + continue + + if columns[0] != "Link": + continue + + entries.append( + entry_template.substitute(tz_key=columns[2], tz_value=columns[1]) + ) + + print(cpp_template.substitute(entries="\n".join(entries))) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index c56d6ef9c3ac..364fd2baf297 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -234,5 +234,54 @@ TEST(TimeZoneMapTest, invalid) { VELOX_ASSERT_THROW(getTimeZoneID("etc/GMT+300"), "Unknown time zone"); } +TEST(TimeZoneMapTest, getShortName) { + auto toShortName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getShortName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("UTC", toShortName("+00:00", 0)); + + // Test offsets that do not map to named time zones. + EXPECT_EQ("+00:01", toShortName("+00:01", 0)); + EXPECT_EQ("-00:01", toShortName("-00:01", 0)); + EXPECT_EQ("+01:00", toShortName("+01:00", 0)); + EXPECT_EQ("-01:01", toShortName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("PDT", toShortName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("PST", toShortName("America/Los_Angeles", ts)); +} + +TEST(TimeZoneMapTest, getLongName) { + auto toLongName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getLongName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("Coordinated Universal Time", toLongName("+00:00", 0)); + + // Test offsets that do not map to named time zones. + EXPECT_EQ("+00:01", toLongName("+00:01", 0)); + EXPECT_EQ("-00:01", toLongName("-00:01", 0)); + EXPECT_EQ("+01:00", toLongName("+01:00", 0)); + EXPECT_EQ("-01:01", toLongName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("Pacific Daylight Time", toLongName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("Pacific Standard Time", toLongName("America/Los_Angeles", ts)); +} } // namespace } // namespace facebook::velox::tz