From 36deaeb5adc7e7ff86f525ac55f79372ba4021ec Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 28 Oct 2024 10:00:28 -0700 Subject: [PATCH 1/6] Fix Presto's format_datetime function with time zone (#11283) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11283 The format_datetime currently outptus the time zone id if for 3 or fewer 'z' characters in the format string. However, the JODA library, which this is based on, does this for 3 or more 'Z' characters. https://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html This diff fixes this, as well as adds support for a single 'Z' (which outputs the same thing as 'ZZ' just without the colon). So 'Z' is fully supported for any number of characters. Differential Revision: D64500193 --- velox/functions/lib/DateTimeFormatter.cpp | 63 ++++++++++--------- .../prestosql/tests/DateTimeFunctionsTest.cpp | 29 ++++++--- .../types/TimestampWithTimeZoneType.cpp | 2 +- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index b86d3eb24b8a..d930f1ad6b93 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -518,7 +518,7 @@ std::string formatFractionOfSecond( return toAdd; } -int32_t appendTimezoneOffset(int64_t offset, char* result) { +int32_t appendTimezoneOffset(int64_t offset, char* result, bool includeColon) { int pos = 0; if (offset >= 0) { result[pos++] = '+'; @@ -536,7 +536,9 @@ int32_t appendTimezoneOffset(int64_t offset, char* result) { result[pos++] = char(hours % 10 + '0'); } - result[pos++] = ':'; + if (includeColon) { + result[pos++] = ':'; + } const auto minutes = (offset / 60) % 60; if LIKELY (minutes == 0) { @@ -1068,20 +1070,26 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { size += std::max((int)token.pattern.minRepresentDigits, 9); break; case DateTimeFormatSpecifier::TIMEZONE: - if (timezone == nullptr) { - VELOX_USER_FAIL("Timezone unknown"); - } - size += std::max( - token.pattern.minRepresentDigits, timezone->name().length()); + VELOX_NYI( + "Date format specifier is not yet implemented: {} ({})", + getSpecifierName(token.pattern.specifier), + token.pattern.minRepresentDigits); + break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: - if (token.pattern.minRepresentDigits != 2) { - VELOX_UNSUPPORTED( - "Date format specifier is not supported: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + if (token.pattern.minRepresentDigits == 1) { + // 'Z' means output the time zone offset without a colon. + size += 8; + } else if (token.pattern.minRepresentDigits == 2) { + // 'ZZ' means output the time zone offset with a colon. + size += 9; + } else { + // 'ZZZ' (or more) means otuput the time zone ID. + if (timezone == nullptr) { + VELOX_USER_FAIL("Timezone unknown"); + } + size += timezone->name().length(); } - size += 9; break; // Not supported. case DateTimeFormatSpecifier::WEEK_YEAR: @@ -1312,35 +1320,28 @@ int32_t DateTimeFormatter::format( case DateTimeFormatSpecifier::TIMEZONE: { // TODO: implement short name time zone, need a map from full name to // short name - if (token.pattern.minRepresentDigits <= 3) { - VELOX_UNSUPPORTED("short name time zone is not yet supported"); - } - if (timezone == nullptr) { - VELOX_USER_FAIL("Timezone unknown"); - } - const auto& piece = timezone->name(); - std::memcpy(result, piece.data(), piece.length()); - result += piece.length(); + VELOX_UNSUPPORTED("time zone name is not yet supported"); } break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: { // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset // with a colon, 'ZZZ' or more outputs the zone id. - // TODO Add support for 'Z' and 'ZZZ'. - if (token.pattern.minRepresentDigits != 2) { - VELOX_UNSUPPORTED( - "format is not supported for specifier {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); - } - if (offset == 0 && zeroOffsetText.has_value()) { std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size()); result += zeroOffsetText->size(); break; } - result += appendTimezoneOffset(offset, result); + if (token.pattern.minRepresentDigits >= 3) { + // Append the time zone ID. + const auto& piece = timezone->name(); + std::memcpy(result, piece.data(), piece.length()); + result += piece.length(); + break; + } + + result += appendTimezoneOffset( + offset, result, token.pattern.minRepresentDigits == 2); break; } case DateTimeFormatSpecifier::WEEK_OF_WEEK_YEAR: { diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 16ae8a77b414..e9d7c378eb10 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3214,11 +3214,18 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "0010", formatDatetime(parseTimestamp("2022-01-01 03:30:30.001"), "SSSS")); - // Time zone test cases - 'z' + // Time zone test cases - 'Z' setQueryTimeZone("Asia/Kolkata"); EXPECT_EQ( - "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + "Asia/Kolkata", + formatDatetime( + parseTimestamp("1970-01-01"), "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ")); + EXPECT_EQ( + "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "ZZZZ")); + EXPECT_EQ( + "Asia/Kolkata", formatDatetime(parseTimestamp("1970-01-01"), "ZZZ")); EXPECT_EQ("+05:30", formatDatetime(parseTimestamp("1970-01-01"), "ZZ")); + EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-01-01"), "Z")); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); @@ -3243,12 +3250,12 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "AD 19 1970 4 Thu 1970 1 1 1 AM 8 8 8 8 3 11 5 Asia/Kolkata", formatDatetime( parseTimestamp("1970-01-01 02:33:11.5"), - "G C Y e E y D M d a K h H k m s S zzzz")); + "G C Y e E y D M d a K h H k m s S ZZZ")); EXPECT_EQ( "AD 19 1970 4 asdfghjklzxcvbnmqwertyuiop Thu ' 1970 1 1 1 AM 8 8 8 8 3 11 5 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ Asia/Kolkata", formatDatetime( parseTimestamp("1970-01-01 02:33:11.5"), - "G C Y e 'asdfghjklzxcvbnmqwertyuiop' E '' y D M d a K h H k m s S 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ zzzz")); + "G C Y e 'asdfghjklzxcvbnmqwertyuiop' E '' y D M d a K h H k m s S 1234567890\\\"!@#$%^&*()-+`~{}[];:,./ ZZZ")); disableAdjustTimestampToTimezone(); EXPECT_EQ( @@ -3260,15 +3267,19 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "x"), VeloxUserError); EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "z"), VeloxUserError); + formatDatetime(parseTimestamp("1970-01-01"), "q"), VeloxUserError); EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zz"), VeloxUserError); + formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); + + // System errors for patterns we haven't implemented yet. EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zzz"), VeloxUserError); + formatDatetime(parseTimestamp("1970-01-01"), "z"), VeloxRuntimeError); EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "q"), VeloxUserError); + formatDatetime(parseTimestamp("1970-01-01"), "zz"), VeloxRuntimeError); EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); + formatDatetime(parseTimestamp("1970-01-01"), "zzz"), VeloxRuntimeError); + EXPECT_THROW( + formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError); } TEST_F(DateTimeFunctionsTest, formatDateTimeTimezone) { diff --git a/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp b/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp index f1865e55830e..de833dd77f2d 100644 --- a/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp +++ b/velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp @@ -110,7 +110,7 @@ void castToString( const auto* timestamps = input.as>(); auto expectedFormatter = - functions::buildJodaDateTimeFormatter("yyyy-MM-dd HH:mm:ss.SSS zzzz"); + functions::buildJodaDateTimeFormatter("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); VELOX_CHECK( !expectedFormatter.hasError(), "Default format should always be valid, error: {}", From 5ba662c4129b4640334265eb68c00d320f1f828f Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 28 Oct 2024 10:17:51 -0700 Subject: [PATCH 2/6] fix --- .../0006-add_get_time_zone_names.patch | 30 ++++ velox/external/date/tz.cpp | 8 ++ velox/external/date/tz.h | 2 + velox/functions/lib/DateTimeFormatter.cpp | 135 +++++++++++++++++- .../prestosql/tests/DateTimeFunctionsTest.cpp | 44 +++++- 5 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 velox/external/date/patches/0006-add_get_time_zone_names.patch diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch new file mode 100644 index 000000000000..f7b60effe4f3 --- /dev/null +++ b/velox/external/date/patches/0006-add_get_time_zone_names.patch @@ -0,0 +1,30 @@ +diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp +--- a/velox/external/date/tz.cpp ++++ b/velox/external/date/tz.cpp +@@ -3538,6 +3538,14 @@ + return get_tzdb_list().front(); + } + ++std::vector get_time_zone_names() { ++ std::vector result; ++ for (const auto& z : get_tzdb().zones) { ++ result.push_back(z.name()); ++ } ++ return result; ++} ++ + const time_zone* + #if HAS_STRING_VIEW + tzdb::locate_zone(std::string_view tz_name) const +diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h +--- a/velox/external/date/tz.h ++++ b/velox/external/date/tz.h +@@ -1258,6 +1258,8 @@ + + DATE_API const tzdb& get_tzdb(); + ++std::vector get_time_zone_names(); ++ + class tzdb_list + { + std::atomic head_{nullptr}; diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp index 69513d7d3145..13ebe93561da 100644 --- a/velox/external/date/tz.cpp +++ b/velox/external/date/tz.cpp @@ -3538,6 +3538,14 @@ get_tzdb() return get_tzdb_list().front(); } +std::vector get_time_zone_names() { + std::vector result; + for (const auto& z : get_tzdb().zones) { + result.push_back(z.name()); + } + return result; +} + const time_zone* #if HAS_STRING_VIEW tzdb::locate_zone(std::string_view tz_name) const diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h index 4ec0dbb44cfd..aa6d42c8d359 100644 --- a/velox/external/date/tz.h +++ b/velox/external/date/tz.h @@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db); DATE_API const tzdb& get_tzdb(); +std::vector get_time_zone_names(); + class tzdb_list { std::atomic head_{nullptr}; diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index d930f1ad6b93..89215fe91704 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -351,6 +351,133 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) { return -1; } +// Contains a list of all time zone names in a convenient format for searching. +// +// Time zone names without the '/' character (without a prefix) are stored in +// timeZoneNamesWithoutPrefix ordered by size desc. +// +// Time zone names with the '/' character (with a prefix) are stored in a map +// timeZoneNamePrefixMap from prefix (the string before the first '/') to a +// vector of strings which contains the suffixes (the strings after the first +// '/') ordered by size desc. +struct TimeZoneNameMappings { + std::vector timeZoneNamesWithoutPrefix; + std::unordered_map> + timeZoneNamePrefixMap; +}; + +TimeZoneNameMappings getTimeZoneNameMappings() { + // Here we use get_time_zone_names instead of calling get_tzdb and + // constructing the list ourselves because there is some unknown issue with + // the tz library where the time_zone objects after the first one in the tzdb + // will be invalid (contain nullptrs) after the get_tzdb function returns. + const std::vector timeZoneNames = date::get_time_zone_names(); + + TimeZoneNameMappings result; + for (size_t i = 0; i < timeZoneNames.size(); i++) { + const auto& timeZoneName = timeZoneNames[i]; + auto separatorPoint = timeZoneName.find('/'); + + if (separatorPoint == std::string::npos) { + result.timeZoneNamesWithoutPrefix.push_back(timeZoneName); + } else { + std::string prefix = timeZoneName.substr(0, separatorPoint); + std::string suffix = timeZoneName.substr(separatorPoint + 1); + + result.timeZoneNamePrefixMap[prefix].push_back(suffix); + } + } + + std::sort( + result.timeZoneNamesWithoutPrefix.begin(), + result.timeZoneNamesWithoutPrefix.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + + for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) { + std::sort( + suffixes.begin(), + suffixes.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + } + + return result; +} + +int64_t parseTimezoneName(const char* cur, const char* end, Date& date) { + // For time zone names we try to greedily find the longest substring starting + // from cur that is a valid time zone name. To help speed things along we + // treat time zone names as {prefix}/{suffix} (for the first instance of '/') + // and create lists of suffixes per prefix. We order these lists by length of + // the suffix so once we identify the prefix, we can return the first suffix + // we find in the string. We treat time zone names without a prefix (i.e. + // without a '/') separately but similarly. + static const TimeZoneNameMappings timeZoneNameMappings = + getTimeZoneNameMappings(); + + if (cur < end) { + // Find the first instance of '/' in the remainder of the string + const char* separatorPoint = cur; + while (separatorPoint < end && *separatorPoint != '/') { + ++separatorPoint; + } + + // Try to find a time zone with a prefix that includes the speratorPoint. + if (separatorPoint != end) { + std::string prefix(cur, separatorPoint); + + auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix); + if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) { + // This is greedy, find the longest suffix for the given prefix that + // fits the string. We know the value in the map is already sorted by + // length in decreasing order. + for (const auto& suffixName : it->second) { + if (suffixName.size() <= end - separatorPoint - 1 && + suffixName == + std::string_view(separatorPoint + 1, suffixName.size())) { + auto timeZoneNameSize = prefix.size() + 1 + suffixName.size(); + date.timezone = + tz::locateZone(std::string_view(cur, timeZoneNameSize), false); + + if (!date.timezone) { + return -1; + } + + return timeZoneNameSize; + } + } + } + } + + // If we found a '/' but didn't find a match in the set of time zones with + // prefixes, try search before the '/' for a time zone without a prefix. If + // we didn't find a '/' then end already equals separatorPoint. + end = separatorPoint; + + for (const auto& timeZoneName : + timeZoneNameMappings.timeZoneNamesWithoutPrefix) { + // Again, this is greedy, find the largest time zone name without a prefix + // that fits the string. We know timeZoneNamesWithoutPrefix is already + // sorted by length in decreasing order. + if (timeZoneName.size() <= end - cur && + timeZoneName == std::string_view(cur, timeZoneName.size())) { + date.timezone = tz::locateZone(timeZoneName, false); + + if (!date.timezone) { + return -1; + } + + return timeZoneName.size(); + } + } + } + + return -1; +} + int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) { // For timezone offset ids, there are three formats allowed by Joda: // @@ -689,7 +816,13 @@ int32_t parseFromPattern( bool specifierNext, DateTimeFormatterType type) { if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) { - auto size = parseTimezoneOffset(cur, end, date); + int64_t size; + if (curPattern.minRepresentDigits < 3) { + size = parseTimezoneOffset(cur, end, date); + } else { + size = parseTimezoneName(cur, end, date); + } + if (size == -1) { return -1; } diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index e9d7c378eb10..97807df92a0e 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -2976,10 +2976,52 @@ TEST_F(DateTimeFunctionsTest, parseDatetime) { ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); EXPECT_EQ( ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); - + // Test a time zone with a prefix. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "2024-02-25+06:00:99 America/Los_Angeles", + "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix is greedy. Etc/GMT-1 and Etc/GMT-10 are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708804800000, "Etc/GMT-10"), + parseDatetime( + "2024-02-25+06:00:99 Etc/GMT-10", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone without a prefix is greedy. NZ and NZ-CHAT are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708791300000, "NZ-CHAT"), + parseDatetime("2024-02-25+06:00:99 NZ-CHAT", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "America/Los_Angeles2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test a time zone without a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test parsing can fall back to checking for time zones without a prefix when + // a '/' is present but not part of the time zone name. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT/2024-02-25+06:00:99", "ZZZ/yyyy-MM-dd+HH:mm:99")); + + // Test an invalid time zone without a prefix. (zzz should be used to match + // abbreviations) VELOX_ASSERT_THROW( parseDatetime("2024-02-25+06:00:99 PST", "yyyy-MM-dd+HH:mm:99 ZZZ"), "Invalid date format: '2024-02-25+06:00:99 PST'"); + // Test an invalid time zone with a prefix that doesn't appear at all. + VELOX_ASSERT_THROW( + parseDatetime("2024-02-25+06:00:99 ABC/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 ABC/XYZ'"); + // Test an invalid time zone with a prefix that does appear. + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 America/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 America/XYZ'"); } TEST_F(DateTimeFunctionsTest, formatDateTime) { From b519f88e6b209d8164a53820d8e5fa73d32fc6d4 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 28 Oct 2024 11:18:35 -0700 Subject: [PATCH 3/6] format datetime tz abbrev --- velox/functions/lib/DateTimeFormatter.cpp | 31 ++++++++++---- .../prestosql/tests/DateTimeFunctionsTest.cpp | 33 ++++++++++++--- velox/type/tz/TimeZoneMap.cpp | 40 ++++++++++++++----- velox/type/tz/TimeZoneMap.h | 9 +++++ velox/type/tz/tests/TimeZoneMapTest.cpp | 25 ++++++++++++ 5 files changed, 114 insertions(+), 24 deletions(-) diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 89215fe91704..ec4f973213ad 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1203,10 +1203,17 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { size += std::max((int)token.pattern.minRepresentDigits, 9); break; case DateTimeFormatSpecifier::TIMEZONE: - VELOX_NYI( - "Date format specifier is not yet implemented: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + if (token.pattern.minRepresentDigits <= 3) { + // The longest abbreviation according to here is 5, e.g. some time + // zones use the offset as the abbreviation, like +0530. + // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + size += 5; + } else { + VELOX_NYI( + "Date format specifier is not yet implemented: {} ({})", + getSpecifierName(token.pattern.specifier), + token.pattern.minRepresentDigits); + } break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: @@ -1451,9 +1458,19 @@ int32_t DateTimeFormatter::format( } break; case DateTimeFormatSpecifier::TIMEZONE: { - // TODO: implement short name time zone, need a map from full name to - // short name - VELOX_UNSUPPORTED("time zone name is not yet supported"); + VELOX_USER_CHECK_NOT_NULL( + timezone, + "The time zone cannot be formatted if it is not present."); + if (token.pattern.minRepresentDigits <= 3) { + const std::string& abbrev = timezone->getShortName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, abbrev.data(), abbrev.length()); + result += abbrev.length(); + } else { + // TODO: implement full name time zone + VELOX_NYI("full time zone name is not yet supported"); + } } break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: { diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 97807df92a0e..afa4f5a8ac91 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3269,6 +3269,28 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_EQ("+05:30", formatDatetime(parseTimestamp("1970-01-01"), "ZZ")); EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-01-01"), "Z")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz")); + EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + + // Test daylight savings. + setQueryTimeZone("America/Los_Angeles"); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-03-10 01:00"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-03-10 03:00"), "z")); + EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-11-03 01:00"), "z")); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-11-03 02:00"), "z")); + + // Test a long abbreviation. + setQueryTimeZone("Asia/Colombo"); + EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z")); + + setQueryTimeZone("Asia/Kolkata"); + // We don't support more than 3 'z's yet. + EXPECT_THROW( + formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError); + // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''")); @@ -3313,15 +3335,14 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_THROW( formatDatetime(parseTimestamp("1970-01-01"), "'abcd"), VeloxUserError); - // System errors for patterns we haven't implemented yet. + // Time zone name patterns aren't supported when there isn't a time zone + // available. EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "z"), VeloxRuntimeError); + formatDatetime(parseTimestamp("1970-01-01"), "z"), VeloxUserError); EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zz"), VeloxRuntimeError); + formatDatetime(parseTimestamp("1970-01-01"), "zz"), VeloxUserError); EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zzz"), VeloxRuntimeError); - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError); + formatDatetime(parseTimestamp("1970-01-01"), "zzz"), VeloxUserError); } TEST_F(DateTimeFunctionsTest, formatDateTimeTimezone) { diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 93dacadba677..2b7e8968ed04 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -242,6 +242,21 @@ void validateRangeImpl(time_point timePoint) { } } +template +date::zoned_time getZonedTime( + const date::time_zone* tz, + date::local_time timestamp, + TimeZone::TChoose choose) { + if (choose == TimeZone::TChoose::kFail) { + // By default, throws. + return date::zoned_time{tz, timestamp}; + } + + auto dateChoose = (choose == TimeZone::TChoose::kEarliest) + ? date::choose::earliest + : date::choose::latest; + return date::zoned_time{tz, timestamp, dateChoose}; +} } // namespace void validateRange(time_point timePoint) { @@ -337,17 +352,7 @@ TimeZone::seconds TimeZone::to_sys( return (timePoint - offset_).time_since_epoch(); } - if (choose == TimeZone::TChoose::kFail) { - // By default, throws. - return date::zoned_time{tz_, timePoint}.get_sys_time().time_since_epoch(); - } - - auto dateChoose = (choose == TimeZone::TChoose::kEarliest) - ? date::choose::earliest - : date::choose::latest; - return date::zoned_time{tz_, timePoint, dateChoose} - .get_sys_time() - .time_since_epoch(); + return getZonedTime(tz_, timePoint, choose).get_sys_time().time_since_epoch(); } TimeZone::seconds TimeZone::to_local(TimeZone::seconds timestamp) const { @@ -361,4 +366,17 @@ TimeZone::seconds TimeZone::to_local(TimeZone::seconds timestamp) const { return date::zoned_time{tz_, timePoint}.get_local_time().time_since_epoch(); } +std::string TimeZone::getShortName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + date::local_time timePoint{timestamp}; + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name (no abbreviations). + if (tz_ == nullptr) { + return timeZoneName_; + } + + return getZonedTime(tz_, timePoint, choose).get_info().abbrev; +} } // namespace facebook::velox::tz diff --git a/velox/type/tz/TimeZoneMap.h b/velox/type/tz/TimeZoneMap.h index 9554d7506328..c7e5c0f6087e 100644 --- a/velox/type/tz/TimeZoneMap.h +++ b/velox/type/tz/TimeZoneMap.h @@ -113,6 +113,7 @@ class TimeZone { } using seconds = std::chrono::seconds; + using milliseconds = std::chrono::milliseconds; /// Converts a local time (the time as perceived in the user time zone /// represented by this object) to a system time (the corresponding time in @@ -151,6 +152,14 @@ class TimeZone { return tz_; } + /// Returns the short name (abbreviation) of the time zone for the given + /// timestamp. Note that the timestamp is needed for time zones that support + /// daylight savings time as the short name will change depending on the date + /// (e.g. PST/PDT). + std::string getShortName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + private: const date::time_zone* tz_{nullptr}; const std::chrono::minutes offset_{0}; diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index c56d6ef9c3ac..0d0fa21cb43c 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -234,5 +234,30 @@ TEST(TimeZoneMapTest, invalid) { VELOX_ASSERT_THROW(getTimeZoneID("etc/GMT+300"), "Unknown time zone"); } +TEST(TimeZoneMapTest, getShortName) { + auto toShortName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getShortName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("UTC", toShortName("+00:00", 0)); + + // Test offsets that do not map to named time zones. + EXPECT_EQ("+00:01", toShortName("+00:01", 0)); + EXPECT_EQ("-00:01", toShortName("-00:01", 0)); + EXPECT_EQ("+01:00", toShortName("+01:00", 0)); + EXPECT_EQ("-01:01", toShortName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("PDT", toShortName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("PST", toShortName("America/Los_Angeles", ts)); +} + } // namespace } // namespace facebook::velox::tz From f7fb8d17248eb005caf76b727547616b7dbafc56 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 28 Oct 2024 20:08:30 -0700 Subject: [PATCH 4/6] date format long time zone name --- .github/workflows/linux-build.yml | 1 + CMake/resolve_dependency_modules/boost.cmake | 29 +++------- CMakeLists.txt | 19 +++++++ scripts/setup-ubuntu.sh | 1 - velox/functions/lib/CMakeLists.txt | 3 +- velox/functions/lib/DateTimeFormatter.cpp | 15 +++--- .../prestosql/tests/DateTimeFunctionsTest.cpp | 45 ++++++++++++++-- velox/type/tz/CMakeLists.txt | 4 +- velox/type/tz/TimeZoneMap.cpp | 54 +++++++++++++++++++ velox/type/tz/TimeZoneMap.h | 8 +++ velox/type/tz/tests/TimeZoneMapTest.cpp | 24 +++++++++ 11 files changed, 167 insertions(+), 36 deletions(-) diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index b40b9af49356..476c786a63e3 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -177,6 +177,7 @@ jobs: - name: Make Debug Build env: VELOX_DEPENDENCY_SOURCE: BUNDLED + ICU_SOURCE: SYSTEM MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3" EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON" run: | diff --git a/CMake/resolve_dependency_modules/boost.cmake b/CMake/resolve_dependency_modules/boost.cmake index bdf6633d0375..842cba6f1334 100644 --- a/CMake/resolve_dependency_modules/boost.cmake +++ b/CMake/resolve_dependency_modules/boost.cmake @@ -13,30 +13,13 @@ # limitations under the License. include_guard(GLOBAL) -if(CMAKE_SYSTEM_NAME MATCHES "Darwin") - if(ON_APPLE_M1) - list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") - else() - list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") - endif() -endif() - -# ICU is only needed with Boost build from source -set_source(ICU) -resolve_dependency( - ICU - COMPONENTS - data - i18n - io - uc - tu - test) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/boost) -if(${ICU_SOURCE} STREQUAL "BUNDLED") - # ensure ICU is built before Boost - add_dependencies(boost_regex ICU ICU::i18n) + +if(ICU_SOURCE) + if(${ICU_SOURCE} STREQUAL "BUNDLED") + # ensure ICU is built before Boost + add_dependencies(boost_regex ICU ICU::i18n) + endif() endif() # This prevents system boost from leaking in diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ba92170c0f..168eee277044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,6 +385,25 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + if(ON_APPLE_M1) + list(APPEND CMAKE_PREFIX_PATH "/opt/homebrew/opt/icu4c") + else() + list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") + endif() +endif() + +set_source(ICU) +resolve_dependency( + ICU + COMPONENTS + data + i18n + io + uc + tu + test) + set(BOOST_INCLUDE_LIBRARIES atomic context diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index 5874e98e9651..31ad1a9ae846 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -297,4 +297,3 @@ function install_apt_deps { fi fi ) - diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt index bdff97bba2d9..5cee3af311d1 100644 --- a/velox/functions/lib/CMakeLists.txt +++ b/velox/functions/lib/CMakeLists.txt @@ -24,7 +24,8 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base) velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp DateTimeFormatterBuilder.cpp) -velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz) +velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz + ICU::i18n ICU::uc) velox_add_library( velox_functions_lib diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index ec4f973213ad..5676a63efc8c 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1209,10 +1209,10 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones size += 5; } else { - VELOX_NYI( - "Date format specifier is not yet implemented: {} ({})", - getSpecifierName(token.pattern.specifier), - token.pattern.minRepresentDigits); + // The longest time zone long name is 40, Australian Central Western + // Standard Time. + // https://www.timeanddate.com/time/zones/ + size += 50; } break; @@ -1468,8 +1468,11 @@ int32_t DateTimeFormatter::format( std::memcpy(result, abbrev.data(), abbrev.length()); result += abbrev.length(); } else { - // TODO: implement full name time zone - VELOX_NYI("full time zone name is not yet supported"); + std::string longName = timezone->getLongName( + std::chrono::milliseconds(timestamp.toMillis()), + tz::TimeZone::TChoose::kEarliest); + std::memcpy(result, longName.data(), longName.length()); + result += longName.length(); } } break; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index afa4f5a8ac91..30090173d8f8 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3272,6 +3272,12 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "zz")); EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-01-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzzzzzzzzzzzzzzzzzzzz")); // Test daylight savings. setQueryTimeZone("America/Los_Angeles"); @@ -3281,16 +3287,47 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-03-10 03:00"), "z")); EXPECT_EQ("PDT", formatDatetime(parseTimestamp("2024-11-03 01:00"), "z")); EXPECT_EQ("PST", formatDatetime(parseTimestamp("2024-11-03 02:00"), "z")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-03-10 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-03-10 03:00"), "zzzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:00"), "zzzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("2024-11-03 02:00"), "zzzz")); + + // Test ambiguous time. + EXPECT_EQ( + "PDT", formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzz")); + EXPECT_EQ( + "Pacific Daylight Time", + formatDatetime(parseTimestamp("2024-11-03 01:30:00"), "zzzz")); // Test a long abbreviation. setQueryTimeZone("Asia/Colombo"); EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "India Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); - setQueryTimeZone("Asia/Kolkata"); - // We don't support more than 3 'z's yet. - EXPECT_THROW( - formatDatetime(parseTimestamp("1970-01-01"), "zzzz"), VeloxRuntimeError); + // Test a long long name. + setQueryTimeZone("Australia/Eucla"); + EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z")); + EXPECT_EQ( + "Australian Central Western Standard Time", + formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); EXPECT_EQ("'", formatDatetime(parseTimestamp("1970-01-01"), "''")); diff --git a/velox/type/tz/CMakeLists.txt b/velox/type/tz/CMakeLists.txt index b7d96feb1262..614b594000a7 100644 --- a/velox/type/tz/CMakeLists.txt +++ b/velox/type/tz/CMakeLists.txt @@ -23,4 +23,6 @@ velox_link_libraries( velox_external_date Boost::regex fmt::fmt - Folly::folly) + Folly::folly + ICU::i18n + ICU::uc) diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 2b7e8968ed04..b68c88c6be04 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -20,6 +20,16 @@ #include #include #include + +#include +#include +#include + +// The ICU libraries define TRUE/FALSE macros which frequently conflict with +// other libraries that use these as enum/variable names. +#undef TRUE +#undef FALSE + #include "velox/common/base/Exceptions.h" #include "velox/common/testutil/TestValue.h" #include "velox/external/date/tz.h" @@ -379,4 +389,48 @@ std::string TimeZone::getShortName( return getZonedTime(tz_, timePoint, choose).get_info().abbrev; } + +std::string TimeZone::getLongName( + TimeZone::milliseconds timestamp, + TimeZone::TChoose choose) const { + static const icu::Locale locale("en", "US"); + + validateRange(date::sys_time(timestamp)); + + // Time zone offsets only have one name. + if (tz_ == nullptr) { + return timeZoneName_; + } + + // Special case for UTC. ICU uses "GMT" for some reason which is an + // abbreviation. + if (timeZoneID_ == 0) { + return "Coordinated Universal Time"; + } + + // Get the ICU TimeZone by name + std::unique_ptr tz(icu::TimeZone::createTimeZone( + icu::UnicodeString(timeZoneName_.data(), timeZoneName_.length()))); + VELOX_USER_CHECK_NOT_NULL(tz); + + // According to the documentation this is how to determine if DST applies to + // a given timestamp in a given time zone. + // https://howardhinnant.github.io/date/tz.html#sys_info + date::local_time timePoint{timestamp}; + bool isDst = getZonedTime(tz_, timePoint, choose).get_info().save != + std::chrono::minutes(0); + + // Construct the long name for the time zone. + // Note that ICU does not have DST information for many time zones prior to + // 1970, so it's important to specify it explicitly. + icu::UnicodeString longName; + tz->getDisplayName( + isDst, icu::TimeZone::EDisplayType::LONG, locale, longName); + + // Convert the UnicodeString back to a string and write it out + std::string longNameStr; + longName.toUTF8String(longNameStr); + + return longNameStr; +} } // namespace facebook::velox::tz diff --git a/velox/type/tz/TimeZoneMap.h b/velox/type/tz/TimeZoneMap.h index c7e5c0f6087e..c04cc308657b 100644 --- a/velox/type/tz/TimeZoneMap.h +++ b/velox/type/tz/TimeZoneMap.h @@ -160,6 +160,14 @@ class TimeZone { milliseconds timestamp, TChoose choose = TChoose::kFail) const; + /// Returns the long name of the time zone for the given timestamp, e.g. + /// Pacific Standard Time. Note that the timestamp is needed for time zones + /// that support daylight savings time as the long name will change depending + /// on the date (e.g. Pacific Standard Time vs Pacific Daylight Time). + std::string getLongName( + milliseconds timestamp, + TChoose choose = TChoose::kFail) const; + private: const date::time_zone* tz_{nullptr}; const std::chrono::minutes offset_{0}; diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index 0d0fa21cb43c..364fd2baf297 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -259,5 +259,29 @@ TEST(TimeZoneMapTest, getShortName) { EXPECT_EQ("PST", toShortName("America/Los_Angeles", ts)); } +TEST(TimeZoneMapTest, getLongName) { + auto toLongName = [&](std::string_view name, size_t ts) { + const auto* tz = locateZone(name); + EXPECT_NE(tz, nullptr); + return tz->getLongName(milliseconds{ts}); + }; + + // Test an offset that maps to an actual time zone. + EXPECT_EQ("Coordinated Universal Time", toLongName("+00:00", 0)); + + // Test offsets that do not map to named time zones. + EXPECT_EQ("+00:01", toLongName("+00:01", 0)); + EXPECT_EQ("-00:01", toLongName("-00:01", 0)); + EXPECT_EQ("+01:00", toLongName("+01:00", 0)); + EXPECT_EQ("-01:01", toLongName("-01:01", 0)); + + // In "2024-07-25", America/Los_Angeles was in daylight savings time (UTC-07). + size_t ts = 1721890800000; + EXPECT_EQ("Pacific Daylight Time", toLongName("America/Los_Angeles", ts)); + + // In "2024-01-01", it was not (UTC-08). + ts = 1704096000000; + EXPECT_EQ("Pacific Standard Time", toLongName("America/Los_Angeles", ts)); +} } // namespace } // namespace facebook::velox::tz From 601918157a1391035a3479f656dff45e6ea20b72 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 28 Oct 2024 20:08:31 -0700 Subject: [PATCH 5/6] fix --- velox/functions/lib/DateTimeFormatter.cpp | 6 ++++++ .../prestosql/tests/DateTimeFunctionsTest.cpp | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 5676a63efc8c..32e1ab68f060 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -828,6 +828,12 @@ int32_t parseFromPattern( } cur += size; } else if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE) { + // JODA does not support parsing time zone long names, so neither do we for + // consistency. The pattern for a time zone long name is 4 or more 'z's. + VELOX_USER_CHECK_LT( + curPattern.minRepresentDigits, + 4, + "Parsing time zone long names is not supported."); auto size = parseTimezone(cur, end, date); if (size == -1) { return -1; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 30090173d8f8..9cce58653252 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3022,6 +3022,19 @@ TEST_F(DateTimeFunctionsTest, parseDatetime) { parseDatetime( "2024-02-25+06:00:99 America/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), "Invalid date format: '2024-02-25+06:00:99 America/XYZ'"); + + // Test to ensure we do not support parsing time zone long names (to be + // consistent with JODA). + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 Pacific Standard Time", + "yyyy-MM-dd+HH:mm:99 zzzz"), + "Parsing time zone long names is not supported."); + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 Pacific Standard Time", + "yyyy-MM-dd+HH:mm:99 zzzzzzzzzz"), + "Parsing time zone long names is not supported."); } TEST_F(DateTimeFunctionsTest, formatDateTime) { From 547c86acd535e66778ae649841bc7ad4769beb1f Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Tue, 29 Oct 2024 11:07:48 -0700 Subject: [PATCH 6/6] Fix format_datetime to used linked TimeZone names (#11337) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11337 JODA has this concept of linked Time Zones where given a particular time zone ID, it will write out a different time zone ID when formatting a timestamp as a string (I suspect this is for historical reasons). More information is available about this here https://github.com/JodaOrg/global-tz This diff adds a script to parse the file JODA uses to get these links and turn it into a static map. I've done this for the file in the version of JODA Presto Java currently uses (there have been significant updates in more recent versions). Note that the file guarantees there are no transitive links so we don't need to handle that case. I then updated the format_datetime UDF to use this mapping for the ZZZ (or more) pattern. It will use the linked time zone ID in place of the time zone ID if one is available. This is also fixes casting TimestampWithTimeZone to Varchar which uses the same code path. Reviewed By: xiaoxmeng Differential Revision: D64870014 --- velox/functions/lib/DateTimeFormatter.cpp | 19 +- .../prestosql/tests/DateTimeFunctionsTest.cpp | 11 ++ .../tests/TimestampWithTimeZoneCastTest.cpp | 3 +- velox/type/tz/CMakeLists.txt | 8 +- velox/type/tz/TimeZoneLinks.cpp | 186 ++++++++++++++++++ velox/type/tz/gen_timezone_links.py | 127 ++++++++++++ 6 files changed, 350 insertions(+), 4 deletions(-) create mode 100644 velox/type/tz/TimeZoneLinks.cpp create mode 100755 velox/type/tz/gen_timezone_links.py diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 32e1ab68f060..598146f28812 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -27,6 +27,11 @@ #include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" +namespace facebook::velox::tz { +// Defined in TimeZoneLinks.cpp +extern const std::unordered_map& getTimeZoneLinks(); +} // namespace facebook::velox::tz + namespace facebook::velox::functions { static thread_local std::string timezoneBuffer = "+00:00"; @@ -1234,7 +1239,9 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { if (timezone == nullptr) { VELOX_USER_FAIL("Timezone unknown"); } - size += timezone->name().length(); + + // The longest time zone ID is 32, America/Argentina/ComodRivadavia. + size += 32; } break; // Not supported. @@ -1494,6 +1501,16 @@ int32_t DateTimeFormatter::format( if (token.pattern.minRepresentDigits >= 3) { // Append the time zone ID. const auto& piece = timezone->name(); + + static const auto& timeZoneLinks = tz::getTimeZoneLinks(); + auto timeZoneLinksIter = timeZoneLinks.find(piece); + if (timeZoneLinksIter != timeZoneLinks.end()) { + const auto& timeZoneLink = timeZoneLinksIter->second; + std::memcpy(result, timeZoneLink.data(), timeZoneLink.length()); + result += timeZoneLink.length(); + break; + } + std::memcpy(result, piece.data(), piece.length()); result += piece.length(); break; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 9cce58653252..20265004aafa 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3340,6 +3340,17 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "Australian Central Western Standard Time", formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + // Test a time zone name that is linked to another (that gets replaced when + // converted to a string). + setQueryTimeZone("US/Pacific"); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "America/Los_Angeles", + formatDatetime(parseTimestamp("1970-01-01"), "ZZZ")); + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); diff --git a/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp b/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp index 6b9990ada924..ddd100c6b46a 100644 --- a/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp +++ b/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp @@ -125,7 +125,8 @@ TEST_F(TimestampWithTimeZoneCastTest, toVarchar) { "1970-01-01 01:11:37.123 America/New_York", "1969-12-31 22:11:37.123 America/Los_Angeles", "1970-01-01 14:11:37.123 Asia/Shanghai", - "1970-01-01 11:41:37.123 Asia/Calcutta", + "1970-01-01 11:41:37.123 Asia/Kolkata", // Asia/Calcutta is linked to + // Asia/Kolkata. }); auto result = evaluate("cast(c0 as varchar)", makeRowVector({input})); diff --git a/velox/type/tz/CMakeLists.txt b/velox/type/tz/CMakeLists.txt index 614b594000a7..8f7c2381b823 100644 --- a/velox/type/tz/CMakeLists.txt +++ b/velox/type/tz/CMakeLists.txt @@ -14,8 +14,12 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -velox_add_library(velox_type_tz TimeZoneMap.h TimeZoneDatabase.cpp - TimeZoneMap.cpp) +velox_add_library( + velox_type_tz + TimeZoneMap.h + TimeZoneDatabase.cpp + TimeZoneLinks.cpp + TimeZoneMap.cpp) velox_link_libraries( velox_type_tz diff --git a/velox/type/tz/TimeZoneLinks.cpp b/velox/type/tz/TimeZoneLinks.cpp new file mode 100644 index 000000000000..de1e381ceb91 --- /dev/null +++ b/velox/type/tz/TimeZoneLinks.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is generated. Do not modify it manually. To re-generate it, run: +// +// ./velox/type/tz/gen_timezone_links.py -f /tmp/backward \ +// > velox/type/tz/TimeZoneLinks.cpp +// +// The backward file should be the same one used in JODA, +// The latest is available here : +// https://github.com/JodaOrg/global-tz/blob/global-tz/backward. +// Presto Java currently uses the one found here: +// https://github.com/JodaOrg/global-tz/blob/2024agtz/backward +// To find the current version in Presto Java, check this file at the version of +// JODA Presto Java is using: +// https://github.com/JodaOrg/joda-time/blob/v2.12.7/src/changes/changes.xml +// @generated + +#include +#include + +namespace facebook::velox::tz { + +const std::unordered_map& getTimeZoneLinks() { + static auto* tzLinks = new std::unordered_map([] { + // Work around clang compiler bug causing multi-hour compilation + // with -fsanitize=fuzzer + // https://github.com/llvm/llvm-project/issues/75666 + return std::unordered_map{ + {"Australia/ACT", "Australia/Sydney"}, + {"Australia/LHI", "Australia/Lord_Howe"}, + {"Australia/NSW", "Australia/Sydney"}, + {"Australia/North", "Australia/Darwin"}, + {"Australia/Queensland", "Australia/Brisbane"}, + {"Australia/South", "Australia/Adelaide"}, + {"Australia/Tasmania", "Australia/Hobart"}, + {"Australia/Victoria", "Australia/Melbourne"}, + {"Australia/West", "Australia/Perth"}, + {"Australia/Yancowinna", "Australia/Broken_Hill"}, + {"Brazil/Acre", "America/Rio_Branco"}, + {"Brazil/DeNoronha", "America/Noronha"}, + {"Brazil/East", "America/Sao_Paulo"}, + {"Brazil/West", "America/Manaus"}, + {"Canada/Atlantic", "America/Halifax"}, + {"Canada/Central", "America/Winnipeg"}, + {"Canada/Eastern", "America/Toronto"}, + {"Canada/Mountain", "America/Edmonton"}, + {"Canada/Newfoundland", "America/St_Johns"}, + {"Canada/Pacific", "America/Vancouver"}, + {"Canada/Saskatchewan", "America/Regina"}, + {"Canada/Yukon", "America/Whitehorse"}, + {"Chile/Continental", "America/Santiago"}, + {"Chile/EasterIsland", "Pacific/Easter"}, + {"Cuba", "America/Havana"}, + {"Egypt", "Africa/Cairo"}, + {"Eire", "Europe/Dublin"}, + {"Etc/GMT+0", "Etc/GMT"}, + {"Etc/GMT-0", "Etc/GMT"}, + {"Etc/GMT0", "Etc/GMT"}, + {"Etc/Greenwich", "Etc/GMT"}, + {"Etc/UCT", "Etc/UTC"}, + {"Etc/Universal", "Etc/UTC"}, + {"Etc/Zulu", "Etc/UTC"}, + {"GB", "Europe/London"}, + {"GB-Eire", "Europe/London"}, + {"GMT+0", "Etc/GMT"}, + {"GMT-0", "Etc/GMT"}, + {"GMT0", "Etc/GMT"}, + {"Greenwich", "Etc/GMT"}, + {"Hongkong", "Asia/Hong_Kong"}, + {"Iceland", "Atlantic/Reykjavik"}, + {"Iran", "Asia/Tehran"}, + {"Israel", "Asia/Jerusalem"}, + {"Jamaica", "America/Jamaica"}, + {"Japan", "Asia/Tokyo"}, + {"Kwajalein", "Pacific/Kwajalein"}, + {"Libya", "Africa/Tripoli"}, + {"Mexico/BajaNorte", "America/Tijuana"}, + {"Mexico/BajaSur", "America/Mazatlan"}, + {"Mexico/General", "America/Mexico_City"}, + {"NZ", "Pacific/Auckland"}, + {"NZ-CHAT", "Pacific/Chatham"}, + {"Navajo", "America/Denver"}, + {"PRC", "Asia/Shanghai"}, + {"Poland", "Europe/Warsaw"}, + {"Portugal", "Europe/Lisbon"}, + {"ROC", "Asia/Taipei"}, + {"ROK", "Asia/Seoul"}, + {"Singapore", "Asia/Singapore"}, + {"Turkey", "Europe/Istanbul"}, + {"UCT", "Etc/UTC"}, + {"US/Alaska", "America/Anchorage"}, + {"US/Aleutian", "America/Adak"}, + {"US/Arizona", "America/Phoenix"}, + {"US/Central", "America/Chicago"}, + {"US/East-Indiana", "America/Indiana/Indianapolis"}, + {"US/Eastern", "America/New_York"}, + {"US/Hawaii", "Pacific/Honolulu"}, + {"US/Indiana-Starke", "America/Indiana/Knox"}, + {"US/Michigan", "America/Detroit"}, + {"US/Mountain", "America/Denver"}, + {"US/Pacific", "America/Los_Angeles"}, + {"US/Samoa", "Pacific/Pago_Pago"}, + {"UTC", "Etc/UTC"}, + {"Universal", "Etc/UTC"}, + {"W-SU", "Europe/Moscow"}, + {"Zulu", "Etc/UTC"}, + {"America/Buenos_Aires", "America/Argentina/Buenos_Aires"}, + {"America/Catamarca", "America/Argentina/Catamarca"}, + {"America/Cordoba", "America/Argentina/Cordoba"}, + {"America/Indianapolis", "America/Indiana/Indianapolis"}, + {"America/Jujuy", "America/Argentina/Jujuy"}, + {"America/Knox_IN", "America/Indiana/Knox"}, + {"America/Louisville", "America/Kentucky/Louisville"}, + {"America/Mendoza", "America/Argentina/Mendoza"}, + {"America/Virgin", "America/St_Thomas"}, + {"Pacific/Samoa", "Pacific/Pago_Pago"}, + {"Africa/Timbuktu", "Africa/Bamako"}, + {"America/Argentina/ComodRivadavia", "America/Argentina/Catamarca"}, + {"America/Atka", "America/Adak"}, + {"America/Coral_Harbour", "America/Atikokan"}, + {"America/Ensenada", "America/Tijuana"}, + {"America/Fort_Wayne", "America/Indiana/Indianapolis"}, + {"America/Montreal", "America/Toronto"}, + {"America/Nipigon", "America/Toronto"}, + {"America/Pangnirtung", "America/Iqaluit"}, + {"America/Porto_Acre", "America/Rio_Branco"}, + {"America/Rainy_River", "America/Winnipeg"}, + {"America/Rosario", "America/Argentina/Cordoba"}, + {"America/Santa_Isabel", "America/Tijuana"}, + {"America/Shiprock", "America/Denver"}, + {"America/Thunder_Bay", "America/Toronto"}, + {"America/Yellowknife", "America/Edmonton"}, + {"Antarctica/South_Pole", "Antarctica/McMurdo"}, + {"Asia/Chongqing", "Asia/Shanghai"}, + {"Asia/Harbin", "Asia/Shanghai"}, + {"Asia/Kashgar", "Asia/Urumqi"}, + {"Asia/Tel_Aviv", "Asia/Jerusalem"}, + {"Atlantic/Jan_Mayen", "Europe/Oslo"}, + {"Australia/Canberra", "Australia/Sydney"}, + {"Australia/Currie", "Australia/Hobart"}, + {"Europe/Belfast", "Europe/London"}, + {"Europe/Tiraspol", "Europe/Chisinau"}, + {"Europe/Uzhgorod", "Europe/Kyiv"}, + {"Europe/Zaporozhye", "Europe/Kyiv"}, + {"Pacific/Enderbury", "Pacific/Kanton"}, + {"Pacific/Johnston", "Pacific/Honolulu"}, + {"Pacific/Yap", "Pacific/Chuuk"}, + {"Africa/Asmera", "Africa/Asmara"}, + {"America/Godthab", "America/Nuuk"}, + {"Asia/Ashkhabad", "Asia/Ashgabat"}, + {"Asia/Calcutta", "Asia/Kolkata"}, + {"Asia/Chungking", "Asia/Shanghai"}, + {"Asia/Dacca", "Asia/Dhaka"}, + {"Asia/Istanbul", "Europe/Istanbul"}, + {"Asia/Katmandu", "Asia/Kathmandu"}, + {"Asia/Macao", "Asia/Macau"}, + {"Asia/Rangoon", "Asia/Yangon"}, + {"Asia/Saigon", "Asia/Ho_Chi_Minh"}, + {"Asia/Thimbu", "Asia/Thimphu"}, + {"Asia/Ujung_Pandang", "Asia/Makassar"}, + {"Asia/Ulan_Bator", "Asia/Ulaanbaatar"}, + {"Atlantic/Faeroe", "Atlantic/Faroe"}, + {"Europe/Kiev", "Europe/Kyiv"}, + {"Europe/Nicosia", "Asia/Nicosia"}, + {"Pacific/Ponape", "Pacific/Pohnpei"}, + {"Pacific/Truk", "Pacific/Chuuk"}, + }; + }()); + return *tzLinks; +} + +} // namespace facebook::velox::tz diff --git a/velox/type/tz/gen_timezone_links.py b/velox/type/tz/gen_timezone_links.py new file mode 100755 index 000000000000..6b414b247c01 --- /dev/null +++ b/velox/type/tz/gen_timezone_links.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys +from string import Template + +cpp_template = Template( + """\ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is generated. Do not modify it manually. To re-generate it, run: +// +// ./velox/type/tz/gen_timezone_links.py -f /tmp/backward \\ +// > velox/type/tz/TimeZoneLinks.cpp +// +// The backward file should be the same one used in JODA, +// The latest is available here : +// https://github.com/JodaOrg/global-tz/blob/global-tz/backward. +// Presto Java currently uses the one found here: +// https://github.com/JodaOrg/global-tz/blob/2024agtz/backward +// To find the current version in Presto Java, check this file at the version of +// JODA Presto Java is using: +// https://github.com/JodaOrg/joda-time/blob/v2.12.7/src/changes/changes.xml +// @generated + +#include +#include + +namespace facebook::velox::tz { + +const std::unordered_map& getTimeZoneLinks() { + static auto* tzLinks = new std::unordered_map([] { + // Work around clang compiler bug causing multi-hour compilation + // with -fsanitize=fuzzer + // https://github.com/llvm/llvm-project/issues/75666 + return std::unordered_map{ +$entries + }; + }()); + return *tzLinks; +} + +} // namespace facebook::velox::tz\ +""" +) + +entry_template = Template(' {"$tz_key", "$tz_value"},') + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Reads an input file (specified by -f) containing mappings from " + "time zone names to the names that should replace them and prints the " + "corresponding `TimeZoneLinks.cpp` file.", + epilog="(c) Facebook 2004-present", + ) + parser.add_argument( + "-f", + "--file-input", + required=True, + help="Input timezone links file.", + ) + return parser.parse_args() + + +def main(): + args = parse_arguments() + entries = [] + + # Read an input file with links following the format: + # + # > Link Australia/Sydney Australia/NSW + # > Link Australia/Darwin Australia/North + # > Link Australia/Brisbane Australia/Queensland + # > ... + # + # Possible containing comments (everything in a line after # is ignored) + + with open(args.file_input) as file_in: + for line in file_in: + line = line.strip() + + columns = line.split() + if len(columns) == 0: + continue + + if columns[0] != "Link": + continue + + entries.append( + entry_template.substitute(tz_key=columns[2], tz_value=columns[1]) + ) + + print(cpp_template.substitute(entries="\n".join(entries))) + return 0 + + +if __name__ == "__main__": + sys.exit(main())