From 8ab9d11c95fbfd93d6381f95238adceef5109052 Mon Sep 17 00:00:00 2001 From: Pedro Eugenio Rocha Pedreira Date: Wed, 31 Jul 2024 01:17:22 -0700 Subject: [PATCH] Add normalization rules for time zone offsets (#10611) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10611 When resolving time zone offsets, minutes or the ':' separator may be omitted (check inline code comments). Reviewed By: amitkdutta Differential Revision: D60432366 fbshipit-source-id: 71e06fe78ba63148c179c8481ac42d44061c48c6 --- velox/expression/tests/CastExprTest.cpp | 4 +++ velox/type/tests/TimestampConversionTest.cpp | 6 ++++ velox/type/tz/TimeZoneMap.cpp | 33 ++++++++++++++++++++ velox/type/tz/tests/TimeZoneMapTest.cpp | 9 ++++++ 4 files changed, 52 insertions(+) diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 4f35490847c1..a86bc1f515a9 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -565,6 +565,8 @@ TEST_F(CastExprTest, stringToTimestamp) { "1970-01-01 00:00:00", "2000-01-01 12:21:56", "1970-01-01 00:00:00-02:00", + "1970-01-01 00:00:00 +02", + "1970-01-01 00:00:00 -0101", std::nullopt, }; std::vector> expected{ @@ -574,6 +576,8 @@ TEST_F(CastExprTest, stringToTimestamp) { Timestamp(0, 0), Timestamp(946729316, 0), Timestamp(7200, 0), + Timestamp(-7200, 0), + Timestamp(3660, 0), std::nullopt, }; testCast("timestamp", input, expected); diff --git a/velox/type/tests/TimestampConversionTest.cpp b/velox/type/tests/TimestampConversionTest.cpp index 2785661e1b61..61dfa5007440 100644 --- a/velox/type/tests/TimestampConversionTest.cpp +++ b/velox/type/tests/TimestampConversionTest.cpp @@ -258,6 +258,12 @@ TEST(DateTimeUtilTest, fromTimestampWithTimezoneString) { EXPECT_EQ( parseTimestampWithTimezone("1970-01-01 00:00:00+13:36"), std::make_pair(Timestamp(0, 0), tz::getTimeZoneID("+13:36"))); + EXPECT_EQ( + parseTimestampWithTimezone("1970-01-01 00:00:00 -11"), + std::make_pair(Timestamp(0, 0), tz::getTimeZoneID("-11:00"))); + EXPECT_EQ( + parseTimestampWithTimezone("1970-01-01 00:00:00 +0000"), + std::make_pair(Timestamp(0, 0), tz::getTimeZoneID("+00:00"))); EXPECT_EQ( parseTimestampWithTimezone("1970-01-01 00:00:00Z"), diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index a5c0abf40544..995c50a1f801 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -110,6 +110,10 @@ inline bool startsWith(std::string_view str, const char* prefix) { return str.rfind(prefix, 0) == 0; } +inline bool isTimeZoneOffset(std::string_view str) { + return str.size() >= 3 && (str[0] == '+' || str[0] == '-'); +} + // The timezone parsing logic follows what is defined here: // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones inline bool isUtcEquivalentName(std::string_view zone) { @@ -118,7 +122,36 @@ inline bool isUtcEquivalentName(std::string_view zone) { return utcSet.find(zone) != utcSet.end(); } +// This function tries to apply two normalization rules to time zone offsets: +// +// 1. If the offset only defines the hours portion, assume minutes are zeroed +// out (e.g. "+00" -> "+00:00") +// +// 2. Check if the ':' in between in missing; if so, correct the offset string +// (e.g. "+0000" -> "+00:00"). +// +// This function assumes the first character is either '+' or '-'. +std::string normalizeTimeZoneOffset(const std::string& zoneOffset) { + if (zoneOffset.size() == 3 && isDigit(zoneOffset[1]) && + isDigit(zoneOffset[2])) { + return zoneOffset + ":00"; + } else if ( + zoneOffset.size() == 5 && isDigit(zoneOffset[1]) && + isDigit(zoneOffset[2]) && isDigit(zoneOffset[3]) && + isDigit(zoneOffset[4])) { + return zoneOffset.substr(0, 3) + ':' + zoneOffset.substr(3, 2); + } + return zoneOffset; +} + std::string normalizeTimeZone(const std::string& originalZoneId) { + // If this is an offset that hasn't matched, check if this is an incomplete + // offset. + if (isTimeZoneOffset(originalZoneId)) { + return normalizeTimeZoneOffset(originalZoneId); + } + + // Otherwise, try other time zone name normalizations. std::string_view zoneId = originalZoneId; const bool startsWithEtc = startsWith(zoneId, "etc/"); diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index fcc45426fe2f..c56d6ef9c3ac 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -182,10 +182,19 @@ TEST(TimeZoneMapTest, getTimeZoneID) { // (+/-)XX:MM format. EXPECT_EQ(840, getTimeZoneID("-00:01")); EXPECT_EQ(0, getTimeZoneID("+00:00")); + EXPECT_EQ(0, getTimeZoneID("-00:00")); EXPECT_EQ(454, getTimeZoneID("-06:27")); EXPECT_EQ(541, getTimeZoneID("-05:00")); EXPECT_EQ(1140, getTimeZoneID("+05:00")); + // Incomplete time zone offsets. + EXPECT_EQ(1140, getTimeZoneID("+05")); + EXPECT_EQ(1140, getTimeZoneID("+0500")); + EXPECT_EQ(1150, getTimeZoneID("+0510")); + EXPECT_EQ(181, getTimeZoneID("-1100")); + EXPECT_EQ(181, getTimeZoneID("-11")); + EXPECT_EQ(0, getTimeZoneID("+0000")); + EXPECT_EQ(0, getTimeZoneID("etc/GMT+0")); EXPECT_EQ(0, getTimeZoneID("etc/GMT-0")); EXPECT_EQ(1020, getTimeZoneID("etc/GMT-3"));