From d90038daf6fca4d56ccb0f784cfb9b3d03fb2905 Mon Sep 17 00:00:00 2001 From: Daniel Hunte Date: Thu, 17 Oct 2024 10:31:37 -0700 Subject: [PATCH] Clean up timezoneId from DateTimeFormatter (#11259) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11259 We want to capture a tz::TimeZone* instead of relying on timezoneId inside dateTimeFormatter. The former is more flexible and provides conversion capabilities. Reviewed By: kevinwilfong Differential Revision: D64348245 fbshipit-source-id: d10888e4ade9bbff2aea6c2b25800ced41a2daf0 --- velox/functions/lib/DateTimeFormatter.cpp | 93 ++++++++----------- velox/functions/lib/DateTimeFormatter.h | 2 +- .../lib/tests/DateTimeFormatterTest.cpp | 22 ++--- velox/functions/prestosql/DateTimeFunctions.h | 9 +- velox/functions/sparksql/DateTimeFunctions.h | 6 +- .../sparksql/tests/DateTimeFunctionsTest.cpp | 2 +- 6 files changed, 58 insertions(+), 76 deletions(-) diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 4aac4b6c7135..86f40b351948 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -57,7 +57,7 @@ struct Date { int32_t second = 0; int32_t microsecond = 0; bool isAm = true; // AM -> true, PM -> false - int64_t timezoneId = -1; + const tz::TimeZone* timezone = nullptr; bool isClockHour = false; // Whether most recent hour specifier is clockhour bool isHourOfHalfDay = @@ -318,28 +318,29 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) { if (cur < end) { // If there are at least 3 letters left. if (end - cur >= 3) { - static std::unordered_map defaultTzNames{ - {"UTC", 0}, - {"GMT", 0}, - {"EST", tz::getTimeZoneID("America/New_York")}, - {"EDT", tz::getTimeZoneID("America/New_York")}, - {"CST", tz::getTimeZoneID("America/Chicago")}, - {"CDT", tz::getTimeZoneID("America/Chicago")}, - {"MST", tz::getTimeZoneID("America/Denver")}, - {"MDT", tz::getTimeZoneID("America/Denver")}, - {"PST", tz::getTimeZoneID("America/Los_Angeles")}, - {"PDT", tz::getTimeZoneID("America/Los_Angeles")}, - }; + static std::unordered_map + defaultTzNames{ + {"UTC", tz::locateZone("UTC")}, + {"GMT", tz::locateZone("GMT")}, + {"EST", tz::locateZone("America/New_York")}, + {"EDT", tz::locateZone("America/New_York")}, + {"CST", tz::locateZone("America/Chicago")}, + {"CDT", tz::locateZone("America/Chicago")}, + {"MST", tz::locateZone("America/Denver")}, + {"MDT", tz::locateZone("America/Denver")}, + {"PST", tz::locateZone("America/Los_Angeles")}, + {"PDT", tz::locateZone("America/Los_Angeles")}, + }; auto it = defaultTzNames.find(std::string_view(cur, 3)); if (it != defaultTzNames.end()) { - date.timezoneId = it->second; + date.timezone = it->second; return 3; } } // The format 'UT' is also accepted for UTC. else if ((end - cur == 2) && (*cur == 'U') && (*(cur + 1) == 'T')) { - date.timezoneId = 0; + date.timezone = tz::locateZone("UTC"); return 2; } } @@ -358,68 +359,52 @@ int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) { if (*cur == '-' || *cur == '+') { // Long format: "+00:00" if ((end - cur) >= 6 && *(cur + 3) == ':') { - // Fast path for the common case ("+00:00" or "-00:00"), to prevent - // calling getTimeZoneID(), which does a map lookup. - if (std::strncmp(cur + 1, "00:00", 5) == 0) { - date.timezoneId = 0; - } else { - date.timezoneId = tz::getTimeZoneID(std::string_view(cur, 6), false); - if (date.timezoneId == -1) { - return -1; - } + date.timezone = tz::locateZone(std::string_view(cur, 6), false); + if (!date.timezone) { + return -1; } return 6; } // Long format without colon: "+0000" else if ((end - cur) >= 5 && *(cur + 3) != ':') { - // Same fast path described above. - if (std::strncmp(cur + 1, "0000", 4) == 0) { - date.timezoneId = 0; - } else { - // We need to concatenate the 3 first chars with ":" followed by the - // last 2 chars before calling getTimeZoneID, so we use a static - // thread_local buffer to prevent extra allocations. - std::memcpy(&timezoneBuffer[0], cur, 3); - std::memcpy(&timezoneBuffer[4], cur + 3, 2); - date.timezoneId = tz::getTimeZoneID(timezoneBuffer, false); - if (date.timezoneId == -1) { - return -1; - } + // We need to concatenate the 3 first chars with ":" followed by the + // last 2 chars before calling locateZone, so we use a static + // thread_local buffer to prevent extra allocations. + std::memcpy(&timezoneBuffer[0], cur, 3); + std::memcpy(&timezoneBuffer[4], cur + 3, 2); + date.timezone = tz::locateZone(timezoneBuffer, false); + if (!date.timezone) { + return -1; } return 5; } // Short format: "+00" else if ((end - cur) >= 3) { - // Same fast path described above. - if (std::strncmp(cur + 1, "00", 2) == 0) { - date.timezoneId = 0; - } else { - // We need to concatenate the 3 first chars with a trailing ":00" - // before calling getTimeZoneID, so we use a static thread_local - // buffer to prevent extra allocations. - std::memcpy(&timezoneBuffer[0], cur, 3); - std::memcpy(&timezoneBuffer[4], defaultTrailingOffset, 2); - date.timezoneId = tz::getTimeZoneID(timezoneBuffer, false); - if (date.timezoneId == -1) { - return -1; - } + // We need to concatenate the 3 first chars with a trailing ":00" + // before calling getTimeZoneID, so we use a static thread_local + // buffer to prevent extra allocations. + std::memcpy(&timezoneBuffer[0], cur, 3); + std::memcpy(&timezoneBuffer[4], defaultTrailingOffset, 2); + date.timezone = tz::locateZone(timezoneBuffer, false); + if (!date.timezone) { + return -1; } return 3; } } // Single 'Z' character maps to GMT. else if (*cur == 'Z') { - date.timezoneId = 0; + date.timezone = tz::locateZone("GMT"); return 1; } // "UTC", "UCT", "GMT" and "GMT0" are also acceptable by joda. else if ((end - cur) >= 3) { if (std::strncmp(cur, "UTC", 3) == 0 || std::strncmp(cur, "UCT", 3) == 0) { - date.timezoneId = 0; + date.timezone = tz::locateZone("UTC"); return 3; } else if (std::strncmp(cur, "GMT", 3) == 0) { - date.timezoneId = 0; + date.timezone = tz::locateZone("GMT"); if ((end - cur) >= 4 && *(cur + 3) == '0') { return 4; } @@ -1443,7 +1428,7 @@ Expected DateTimeFormatter::parse( util::fromTime(date.hour, date.minute, date.second, date.microsecond); return DateTimeResult{ util::fromDatetime(daysSinceEpoch.value(), microsSinceMidnight), - date.timezoneId}; + date.timezone}; } Expected> buildMysqlDateTimeFormatter( diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index 82fba6037cb1..7a0dc68f8b61 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -154,7 +154,7 @@ struct DateTimeToken { struct DateTimeResult { Timestamp timestamp; - int64_t timezoneId{-1}; + const tz::TimeZone* timezone = nullptr; }; /// A user defined formatter that formats/parses time to/from user provided diff --git a/velox/functions/lib/tests/DateTimeFormatterTest.cpp b/velox/functions/lib/tests/DateTimeFormatterTest.cpp index e57a39b88d97..588ba950c96d 100644 --- a/velox/functions/lib/tests/DateTimeFormatterTest.cpp +++ b/velox/functions/lib/tests/DateTimeFormatterTest.cpp @@ -123,10 +123,10 @@ class DateTimeFormatterTest : public testing::Test { auto dateTimeResultExpected = getJodaDateTimeFormatter(format)->parse(input); auto result = dateTimeResult(dateTimeResultExpected); - if (result.timezoneId == 0) { + if (result.timezone->id() == 0) { return "+00:00"; } - return tz::getTimeZoneName(result.timezoneId); + return result.timezone->name(); } std::string formatMysqlDateTime( @@ -1096,17 +1096,17 @@ TEST_F(JodaDateTimeFormatterTest, parseMixedYMDFormat) { // Include timezone. auto result = parseJoda("2021-11-05+01:00+09:00", "YYYY-MM-dd+HH:mmZZ"); EXPECT_EQ(fromTimestampString("2021-11-05 01:00:00"), result.timestamp); - EXPECT_EQ("+09:00", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("+09:00", result.timezone->name()); // Timezone offset in -hh:mm format. result = parseJoda("-07:232021-11-05+01:00", "ZZYYYY-MM-dd+HH:mm"); EXPECT_EQ(fromTimestampString("2021-11-05 01:00:00"), result.timestamp); - EXPECT_EQ("-07:23", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("-07:23", result.timezone->name()); // Timezone offset in +hhmm format. result = parseJoda("+01332022-03-08+13:00", "ZZYYYY-MM-dd+HH:mm"); EXPECT_EQ(fromTimestampString("2022-03-08 13:00:00"), result.timestamp); - EXPECT_EQ("+01:33", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("+01:33", result.timezone->name()); // Z in the input means GMT in Joda. EXPECT_EQ( @@ -1117,7 +1117,7 @@ TEST_F(JodaDateTimeFormatterTest, parseMixedYMDFormat) { // Timezone in string format. result = parseJoda("2021-11-05+01:00 PST", "YYYY-MM-dd+HH:mm zz"); EXPECT_EQ(fromTimestampString("2021-11-05 01:00:00"), result.timestamp); - EXPECT_EQ("America/Los_Angeles", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("America/Los_Angeles", result.timezone->name()); } TEST_F(JodaDateTimeFormatterTest, parseMixedWeekFormat) { @@ -1239,17 +1239,17 @@ TEST_F(JodaDateTimeFormatterTest, parseMixedWeekFormat) { auto result = parseJoda("2021 22 1 13:29:21.213+09:00", "x w e HH:mm:ss.SSSZZ"); EXPECT_EQ(fromTimestampString("2021-05-31 13:29:21.213"), result.timestamp); - EXPECT_EQ("+09:00", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("+09:00", result.timezone->name()); // Timezone offset in -hh:mm format. result = parseJoda("-07:232021 22 1 13:29:21.213", "ZZx w e HH:mm:ss.SSS"); EXPECT_EQ(fromTimestampString("2021-05-31 13:29:21.213"), result.timestamp); - EXPECT_EQ("-07:23", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("-07:23", result.timezone->name()); // Timezone offset in +hhmm format. result = parseJoda("+01332021 22 1 13:29:21.213", "ZZx w e HH:mm:ss.SSS"); EXPECT_EQ(fromTimestampString("2021-05-31 13:29:21.213"), result.timestamp); - EXPECT_EQ("+01:33", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("+01:33", result.timezone->name()); } TEST_F(JodaDateTimeFormatterTest, parseFractionOfSecond) { @@ -1257,13 +1257,13 @@ TEST_F(JodaDateTimeFormatterTest, parseFractionOfSecond) { auto result = parseJoda("2022-02-23T12:15:00.364+04:00", "yyyy-MM-dd'T'HH:mm:ss.SSSZ"); EXPECT_EQ(fromTimestampString("2022-02-23 12:15:00.364"), result.timestamp); - EXPECT_EQ("+04:00", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("+04:00", result.timezone->name()); // Valid milliseconds and timezone with negative offset. result = parseJoda("2022-02-23T12:15:00.776-14:00", "yyyy-MM-dd'T'HH:mm:ss.SSSZ"); EXPECT_EQ(fromTimestampString("2022-02-23 12:15:00.776"), result.timestamp); - EXPECT_EQ("-14:00", tz::getTimeZoneName(result.timezoneId)); + EXPECT_EQ("-14:00", result.timezone->name()); // Valid milliseconds. EXPECT_EQ( diff --git a/velox/functions/prestosql/DateTimeFunctions.h b/velox/functions/prestosql/DateTimeFunctions.h index 24072830b0a2..877514dac364 100644 --- a/velox/functions/prestosql/DateTimeFunctions.h +++ b/velox/functions/prestosql/DateTimeFunctions.h @@ -1501,7 +1501,7 @@ struct FromIso8601Timestamp { auto [ts, timeZone] = castResult.value(); // Input string may not contain a timezone - if so, it is interpreted in // session timezone. - if (timeZone == nullptr) { + if (!timeZone) { timeZone = sessionTimeZone_; } ts.toGMT(*timeZone); @@ -1687,9 +1687,8 @@ struct ParseDateTimeFunction { // If timezone was not parsed, fallback to the session timezone. If there's // no session timezone, fallback to 0 (GMT). - const auto* timeZone = dateTimeResult->timezoneId != -1 - ? tz::locateZone(dateTimeResult->timezoneId) - : sessionTimeZone_; + const auto* timeZone = + dateTimeResult->timezone ? dateTimeResult->timezone : sessionTimeZone_; dateTimeResult->timestamp.toGMT(*timeZone); result = pack(dateTimeResult->timestamp, timeZone->id()); return Status::OK(); @@ -1819,7 +1818,7 @@ struct AtTimezoneFunction : public TimestampWithTimezoneSupport { const core::QueryConfig& config, const arg_type* /*tsWithTz*/, const arg_type* timezone) { - if (timezone != nullptr) { + if (timezone) { targetTimezoneID_ = tz::getTimeZoneID( std::string_view(timezone->data(), timezone->size())); } diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index c7a0695e23b3..cfcecc4216f6 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -159,8 +159,7 @@ struct UnixTimestampParseFunction { const tz::TimeZone* getTimeZone(const DateTimeResult& result) { // If timezone was not parsed, fallback to the session timezone. - return result.timezoneId != -1 ? tz::locateZone(result.timezoneId) - : sessionTimeZone_; + return result.timezone ? result.timezone : sessionTimeZone_; } // Default if format is not specified, as per Spark documentation. @@ -400,8 +399,7 @@ struct GetTimestampFunction { const tz::TimeZone* getTimeZone(const DateTimeResult& result) const { // If timezone was not parsed, fallback to the session timezone. If there's // no session timezone, fallback to 0 (GMT). - return result.timezoneId != -1 ? tz::locateZone(result.timezoneId) - : sessionTimeZone_; + return result.timezone ? result.timezone : sessionTimeZone_; } std::shared_ptr formatter_{nullptr}; diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp index ca5f9f4fb2c7..44034c92afd5 100644 --- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp @@ -739,7 +739,7 @@ TEST_F(DateTimeFunctionsTest, getTimestamp) { const auto getTimestampString = [&](const std::optional& dateString, const std::string& format) { - return getTimestamp(dateString, format).value().toString(); + return getTimestamp(dateString, format)->toString(); }; EXPECT_EQ(getTimestamp("1970-01-01", "yyyy-MM-dd"), Timestamp(0, 0));