diff --git a/velox/docs/functions/presto/conversion.rst b/velox/docs/functions/presto/conversion.rst index 2cce8f53429a..9ef6d0eda6f8 100644 --- a/velox/docs/functions/presto/conversion.rst +++ b/velox/docs/functions/presto/conversion.rst @@ -614,24 +614,51 @@ From VARCHAR ^^^^^^^^^^^^ Casting from a string to timestamp is allowed if the string represents a -timestamp in the format `YYYY-MM-DD` followed by an optional `hh:mm:ssZZ`. -Casting from invalid input values throws. +timestamp in the format `YYYY-MM-DD` followed by an optional `hh:mm:ss.MS`. +Seconds and milliseconds are optional. Casting from invalid input values throws. -Valid examples +Valid examples: :: SELECT cast('1970-01-01' as timestamp); -- 1970-01-01 00:00:00 - SELECT cast('1970-01-01 00:00:00' as timestamp); -- 1970-01-01 00:00:00 + SELECT cast('1970-01-01 00:00:00.123' as timestamp); -- 1970-01-01 00:00:00.123 SELECT cast('1970-01-01 02:01' as timestamp); -- 1970-01-01 02:01:00 SELECT cast('1970-01-01 00:00:00-02:00' as timestamp); -- 1970-01-01 02:00:00 -Invalid example +Invalid example: :: SELECT cast('2012-Oct-23' as timestamp); -- Invalid argument +Optionally, strings may also contain timezone information at the end. Timezone +information may be offsets in the format `+01:00` or `-02:00`, for example, or +timezone names, like `UTC`, `Z`, `America/Los_Angeles` and others, +`as defined here `_. + +For example, these strings contain valid timezone information: + +:: + + SELECT cast('1970-01-01 00:00:00 +09:00' as timestamp); + SELECT cast('1970-01-01 00:00:00 UTC' as timestamp); + SELECT cast('1970-01-01 00:00:00 America/Sao_Paulo' as timestamp); + +If timezone information is specified in the string, the returned timestamp +is adjusted to the corresponding timezone. Otherwise, the timestamp is +assumed to be in the client session timezone, and adjusted accordingly +based on the value of `adjust_timestamp_to_session_timezone`, as described below. + +The space between the hour and timezone definition is optional. + +:: + + SELECT cast('1970-01-01 00:00 Z' as timestamp); + SELECT cast('1970-01-01 00:00Z' as timestamp); + +Are both valid. + From DATE ^^^^^^^^^ diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h index 258f9638aaa5..b2017b2a738a 100644 --- a/velox/expression/CastExpr-inl.h +++ b/velox/expression/CastExpr-inl.h @@ -727,34 +727,6 @@ void CastExpr::applyCastPrimitives( }); } } - - // If we're converting to a TIMESTAMP, check if we need to adjust the - // current GMT timezone to the user provided session timezone. - if constexpr (ToKind == TypeKind::TIMESTAMP) { - const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); - // If user explicitly asked us to adjust the timezone. - if (queryConfig.adjustTimestampToTimezone()) { - auto sessionTzName = queryConfig.sessionTimezone(); - if (!sessionTzName.empty()) { - // When context.throwOnError is false, some rows will be marked as - // 'failed'. These rows should not be processed further. 'remainingRows' - // will contain a subset of 'rows' that have passed all the checks (e.g. - // keys are not nulls and number of keys and values is the same). - exec::LocalSelectivityVector remainingRows(context, rows); - context.deselectErrors(*remainingRows); - - // locate_zone throws runtime_error if the timezone couldn't be found - // (so we're safe to dereference the pointer). - auto* timeZone = date::locate_zone(sessionTzName); - auto rawTimestamps = resultFlatVector->mutableRawValues(); - - applyToSelectedNoThrowLocal( - context, *remainingRows, result, [&](int row) { - rawTimestamps[row].toGMT(*timeZone); - }); - } - } - } } template diff --git a/velox/expression/PrestoCastHooks.cpp b/velox/expression/PrestoCastHooks.cpp index 37d1357cf623..98aa4199860c 100644 --- a/velox/expression/PrestoCastHooks.cpp +++ b/velox/expression/PrestoCastHooks.cpp @@ -33,7 +33,22 @@ PrestoCastHooks::PrestoCastHooks(const core::QueryConfig& config) } Timestamp PrestoCastHooks::castStringToTimestamp(const StringView& view) const { - return util::fromTimestampString(view.data(), view.size()); + auto result = util::fromTimestampWithTimezoneString(view.data(), view.size()); + + // If the parsed string has timezone information, convert the timestamp at + // GMT at that time. For example, "1970-01-01 00:00:00 -00:01" is 60 seconds + // at GMT. + if (result.second != -1) { + result.first.toGMT(result.second); + + } + // If no timezone information is available in the input string, check if we + // should understand it as being at the session timezone, and if so, convert + // to GMT. + else if (options_.timeZone != nullptr) { + result.first.toGMT(*options_.timeZone); + } + return result.first; } int32_t PrestoCastHooks::castStringToDate(const StringView& dateString) const { diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 223e4678c722..aa0b07ff9fb8 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -574,6 +574,7 @@ TEST_F(CastExprTest, realAndDoubleToString) { TEST_F(CastExprTest, stringToTimestamp) { std::vector> input{ "1970-01-01", + "1970-01-01 00:00 America/Sao_Paulo", "2000-01-01", "1970-01-01 00:00:00", "2000-01-01 12:21:56", @@ -582,6 +583,7 @@ TEST_F(CastExprTest, stringToTimestamp) { }; std::vector> expected{ Timestamp(0, 0), + Timestamp(10800, 0), Timestamp(946684800, 0), Timestamp(0, 0), Timestamp(946729316, 0), @@ -589,6 +591,24 @@ TEST_F(CastExprTest, stringToTimestamp) { std::nullopt, }; testCast("timestamp", input, expected); + + // Try with a different session timezone. + setTimezone("America/Los_Angeles"); + input = { + "1970-01-01 00:00", + "1970-01-01 00:00 +00:00", + "1970-01-01 00:00 +01:00", + "1970-01-01 00:00 America/Sao_Paulo", + "2000-01-01 12:21:56Z", + }; + expected = { + Timestamp(28800, 0), + Timestamp(0, 0), + Timestamp(-3600, 0), + Timestamp(10800, 0), + Timestamp(946729316, 0), + }; + testCast("timestamp", input, expected); } TEST_F(CastExprTest, timestampToString) { @@ -765,7 +785,7 @@ TEST_F(CastExprTest, timestampAdjustToTimezone) { Timestamp(946713600, 0), Timestamp(0, 0), Timestamp(946758116, 0), - Timestamp(-21600, 0), + Timestamp(-50400, 0), std::nullopt, Timestamp(957164400, 0), }); diff --git a/velox/functions/sparksql/tests/SparkCastExprTest.cpp b/velox/functions/sparksql/tests/SparkCastExprTest.cpp index c185b3cba76b..77984e7e7a73 100644 --- a/velox/functions/sparksql/tests/SparkCastExprTest.cpp +++ b/velox/functions/sparksql/tests/SparkCastExprTest.cpp @@ -204,22 +204,20 @@ TEST_F(SparkCastExprTest, stringToTimestamp) { "2000-01-01", "1970-01-01 00:00:00", "2000-01-01 12:21:56", - "1970-01-01 00:00:00-02:00", std::nullopt, "2015-03-18T12:03:17", - "2015-03-18 12:03:17Z", - "2015-03-18T12:03:17Z", + "2015-03-18 12:03:17", + "2015-03-18T12:03:17", "2015-03-18 12:03:17.123", "2015-03-18T12:03:17.123", - "2015-03-18T12:03:17.456Z", - "2015-03-18 12:03:17.456Z", + "2015-03-18T12:03:17.456", + "2015-03-18 12:03:17.456", }; std::vector> expected{ Timestamp(0, 0), Timestamp(946684800, 0), Timestamp(0, 0), Timestamp(946729316, 0), - Timestamp(7200, 0), std::nullopt, Timestamp(1426680197, 0), Timestamp(1426680197, 0), diff --git a/velox/type/TimestampConversion.cpp b/velox/type/TimestampConversion.cpp index 93e65e1779c8..5ed43c347016 100644 --- a/velox/type/TimestampConversion.cpp +++ b/velox/type/TimestampConversion.cpp @@ -43,6 +43,7 @@ #include "velox/type/TimestampConversion.h" #include "velox/common/base/CheckedArithmetic.h" #include "velox/common/base/Exceptions.h" +#include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::util { @@ -147,6 +148,13 @@ inline bool validDate(int64_t daysSinceEpoch) { daysSinceEpoch <= std::numeric_limits::max(); } +// Skip leading spaces. +inline void skipSpaces(const char* buf, size_t len, size_t& pos) { + while (pos < len && characterIsSpace(buf[pos])) { + pos++; + } +} + bool tryParseDateString( const char* buf, size_t len, @@ -163,11 +171,7 @@ bool tryParseDateString( int32_t year = 0; bool yearneg = false; int sep; - - // Skip leading spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); if (pos >= len) { return false; @@ -311,10 +315,8 @@ bool tryParseDateString( // In strict mode, check remaining string for non-space characters. if (mode == ParseMode::kStrict || mode == ParseMode::kNonStandardNoTimeCast) { - // Skip trailing spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); + // Check position. if end was not reached, non-space chars remaining. if (pos < len) { return false; @@ -344,11 +346,7 @@ bool tryParseTimeString( if (len == 0) { return false; } - - // Skip leading spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); if (pos >= len) { return false; @@ -415,10 +413,7 @@ bool tryParseTimeString( // In strict mode, check remaining string for non-space characters. if (strict) { - // Skip trailing spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); // Check position. If end was not reached, non-space chars remaining. if (pos < len) { @@ -429,6 +424,43 @@ bool tryParseTimeString( return true; } +// String format is "YYYY-MM-DD hh:mm:ss.microseconds" (seconds and microseconds +// are optional). ISO 8601 +bool tryParseTimestampString( + const char* buf, + size_t len, + size_t& pos, + Timestamp& result) { + int64_t daysSinceEpoch = 0; + int64_t microsSinceMidnight = 0; + + if (!tryParseDateString( + buf, len, pos, daysSinceEpoch, ParseMode::kNonStrict)) { + return false; + } + + if (pos == len) { + // No time: only a date. + result = fromDatetime(daysSinceEpoch, 0); + return true; + } + + // Try to parse a time field. + if (buf[pos] == ' ' || buf[pos] == 'T') { + pos++; + } + + size_t timePos = 0; + if (!tryParseTimeString( + buf + pos, len - pos, timePos, microsSinceMidnight, false)) { + return false; + } + + pos += timePos; + result = fromDatetime(daysSinceEpoch, microsSinceMidnight); + return true; +} + bool tryParseUTCOffsetString( const char* buf, size_t& pos, @@ -702,55 +734,58 @@ namespace { Timestamp fromTimestampString(const char* str, size_t len) { size_t pos; - int64_t daysSinceEpoch; - int64_t microsSinceMidnight; + Timestamp resultTimestamp; - if (!tryParseDateString( - str, len, pos, daysSinceEpoch, ParseMode::kNonStrict)) { + if (!tryParseTimestampString(str, len, pos, resultTimestamp)) { parserError(str, len); } + skipSpaces(str, len, pos); - if (pos == len) { - // No time: only a date. - return fromDatetime(daysSinceEpoch, 0); + // If not all input was consumed. + if (pos < len) { + parserError(str, len); } + VELOX_CHECK_EQ(pos, len); + return resultTimestamp; +} - // Try to parse a time field. - if (str[pos] == ' ' || str[pos] == 'T') { - pos++; - } +std::pair fromTimestampWithTimezoneString( + const char* str, + size_t len) { + size_t pos; + Timestamp resultTimestamp; - size_t timePos = 0; - if (!tryParseTimeString( - str + pos, len - pos, timePos, microsSinceMidnight, false)) { + if (!tryParseTimestampString(str, len, pos, resultTimestamp)) { parserError(str, len); } - pos += timePos; - auto timestamp = fromDatetime(daysSinceEpoch, microsSinceMidnight); + int64_t timezoneID = -1; + + if (pos < len && characterIsSpace(str[pos])) { + pos++; + } + // If there is anything left to parse, it must be a timezone definition. if (pos < len) { - // Skip a "Z" at the end (as per the ISO 8601 specs). - if (str[pos] == 'Z') { - pos++; + size_t timezonePos = pos; + while (timezonePos < len && !characterIsSpace(str[timezonePos])) { + timezonePos++; } - int hourOffset, minuteOffset; - if (tryParseUTCOffsetString(str, pos, len, hourOffset, minuteOffset)) { - int32_t secondOffset = - (hourOffset * kSecsPerHour) + (minuteOffset * kSecsPerMinute); - timestamp = Timestamp( - timestamp.getSeconds() - secondOffset, timestamp.getNanos()); + std::string_view timezone(str + pos, timezonePos - pos); + + if ((timezoneID = util::getTimeZoneID(timezone, false)) == -1) { + VELOX_USER_FAIL("Unknown timezone value: \"{}\"", timezone); } // Skip any spaces at the end. - while (pos < len && characterIsSpace(str[pos])) { - pos++; - } + pos = timezonePos; + skipSpaces(str, len, pos); + if (pos < len) { parserError(str, len); } } - return timestamp; + return {resultTimestamp, timezoneID}; } } // namespace facebook::velox::util diff --git a/velox/type/TimestampConversion.h b/velox/type/TimestampConversion.h index c6bee1f3fb69..eb41d3d15057 100644 --- a/velox/type/TimestampConversion.h +++ b/velox/type/TimestampConversion.h @@ -156,14 +156,51 @@ inline int64_t fromTimeString(const StringView& str) { // Timestamp conversion -/// Parses a full ISO 8601 timestamp string, following the format -/// "YYYY-MM-DD HH:MM[:SS[.MS]] +00:00" +/// Parses a full ISO 8601 timestamp string, following the format: +/// +/// "YYYY-MM-DD HH:MM[:SS[.MS]]" +/// +/// Seconds and milliseconds are optional. +/// +/// This function does not accept any timezone information in the string (e.g. +/// UTC, Z, or a timezone offsets). This is because the returned timestamp does +/// not contain timezone information; therefore, it would either be required for +/// this function to convert the parsed timestamp (but we don't know the +/// original timezone), or ignore the timezone information, which would be +/// incorecct. +/// +/// For a timezone-aware version of this function, check +/// `fromTimestampWithTimezoneString()` below. Timestamp fromTimestampString(const char* buf, size_t len); inline Timestamp fromTimestampString(const StringView& str) { return fromTimestampString(str.data(), str.size()); } +/// Parses a full ISO 8601 timestamp string, following the format: +/// +/// "YYYY-MM-DD HH:MM[:SS[.MS]] +00:00" +/// +/// This is a timezone-aware version of the function above +/// `fromTimestampString()` which returns both the parsed timestamp and the +/// timezone ID. It is up to the client to do the expected conversion based on +/// these two values. +/// +/// The timezone information at the end of the string may contain a timezone +/// name (as defined in velox/type/tz/*), such as "UTC" or +/// "America/Los_Angeles", or a timezone offset, like "+06:00" or "-09:30". The +/// white space between the hour definition and timestamp is optional. +/// +/// -1 means no timezone information was found. Throws VeloxUserError in case of +/// parsing errors. +std::pair fromTimestampWithTimezoneString( + const char* buf, + size_t len); + +inline auto fromTimestampWithTimezoneString(const StringView& str) { + return fromTimestampWithTimezoneString(str.data(), str.size()); +} + Timestamp fromDatetime(int64_t daysSinceEpoch, int64_t microsSinceMidnight); } // namespace facebook::velox::util diff --git a/velox/type/tests/ConversionsTest.cpp b/velox/type/tests/ConversionsTest.cpp index 84890bcf66a9..5ea162f0f812 100644 --- a/velox/type/tests/ConversionsTest.cpp +++ b/velox/type/tests/ConversionsTest.cpp @@ -1012,14 +1012,14 @@ TEST_F(ConversionsTest, toTimestamp) { "2000-01-01", "1970-01-01 00:00:00", "2000-01-01 12:21:56", - "1970-01-01 00:00:00-02:00", + "1970-01-01 00:01", }, { Timestamp(0, 0), Timestamp(946684800, 0), Timestamp(0, 0), Timestamp(946729316, 0), - Timestamp(7200, 0), + Timestamp(60, 0), }); // Invalid case. diff --git a/velox/type/tests/TimestampConversionTest.cpp b/velox/type/tests/TimestampConversionTest.cpp index 2264299a65f1..bb94963b9772 100644 --- a/velox/type/tests/TimestampConversionTest.cpp +++ b/velox/type/tests/TimestampConversionTest.cpp @@ -279,6 +279,7 @@ TEST(DateTimeUtilTest, fromTimestampString) { EXPECT_EQ(Timestamp(0, 0), fromTimestampString("1970-01-01 00:00")); EXPECT_EQ(Timestamp(0, 0), fromTimestampString("1970-01-01 00:00:00")); + EXPECT_EQ(Timestamp(0, 0), fromTimestampString("1970-01-01 00:00:00 ")); EXPECT_EQ( Timestamp(946729316, 0), fromTimestampString("2000-01-01 12:21:56")); @@ -286,33 +287,77 @@ TEST(DateTimeUtilTest, fromTimestampString) { Timestamp(946729316, 0), fromTimestampString("2000-01-01T12:21:56")); EXPECT_EQ( Timestamp(946729316, 0), fromTimestampString("2000-01-01T 12:21:56")); +} + +TEST(DateTimeUtilTest, fromTimestampStringInvalid) { + const std::string_view parserError = "Unable to parse timestamp value: "; + const std::string_view overflowError = "integer overflow: "; + const std::string_view timezoneError = "Unknown timezone value: "; + + // Needs at least a date. + VELOX_ASSERT_THROW(fromTimestampString(""), parserError); + VELOX_ASSERT_THROW(fromTimestampString("00:00:00"), parserError); + + // Integer overflow during timestamp parsing. + VELOX_ASSERT_THROW( + fromTimestampString("2773581570-01-01 00:00:00-asd"), overflowError); + VELOX_ASSERT_THROW( + fromTimestampString("-2147483648-01-01 00:00:00-asd"), overflowError); + + // Unexpected timezone definition. + VELOX_ASSERT_THROW( + fromTimestampString("1970-01-01 00:00:00 a"), parserError); + VELOX_ASSERT_THROW(fromTimestampString("1970-01-01 00:00:00Z"), parserError); + VELOX_ASSERT_THROW(fromTimestampString("1970-01-01 00:00:00Z"), parserError); + VELOX_ASSERT_THROW( + fromTimestampString("1970-01-01 00:00:00 UTC"), parserError); + VELOX_ASSERT_THROW( + fromTimestampString("1970-01-01 00:00:00 America/Los_Angeles"), + parserError); + + // Parse timestamp with (broken) timezones. + VELOX_ASSERT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00-asd"), + timezoneError); + VELOX_ASSERT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00Z UTC"), parserError); + VELOX_ASSERT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00+00:00:00"), + timezoneError); + + // Can't have multiple spaces. + VELOX_ASSERT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00 UTC"), + timezoneError); +} - // Test UTC offsets. +TEST(DateTimeUtilTest, fromTimestampWithTimezoneString) { + // -1 means no timezone information. EXPECT_EQ( - Timestamp(7200, 0), fromTimestampString("1970-01-01 00:00:00-02:00")); + fromTimestampWithTimezoneString("1970-01-01 00:00:00"), + std::make_pair(Timestamp(0, 0), -1L)); + + // Test timezone offsets. EXPECT_EQ( - Timestamp(946697400, 0), - fromTimestampString("2000-01-01 00:00:00Z-03:30")); + fromTimestampWithTimezoneString("1970-01-01 00:00:00 -02:00"), + std::make_pair(Timestamp(0, 0), util::getTimeZoneID("-02:00"))); EXPECT_EQ( - Timestamp(1587583417, 0), - fromTimestampString("2020-04-23 04:23:37+09:00")); -} + fromTimestampWithTimezoneString("1970-01-01 00:00:00+13:36"), + std::make_pair(Timestamp(0, 0), util::getTimeZoneID("+13:36"))); -TEST(DateTimeUtilTest, fromTimestampStrInvalid) { - // Needs at least a date. - EXPECT_THROW(fromTimestampString(""), VeloxUserError); - EXPECT_THROW(fromTimestampString("00:00:00"), VeloxUserError); + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:00:00Z"), + std::make_pair(Timestamp(0, 0), util::getTimeZoneID("UTC"))); - // Broken UTC offsets. - EXPECT_THROW(fromTimestampString("1970-01-01 00:00:00-asd"), VeloxUserError); - EXPECT_THROW( - fromTimestampString("1970-01-01 00:00:00+00:00:00"), VeloxUserError); + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:01:00 UTC"), + std::make_pair(Timestamp(60, 0), util::getTimeZoneID("UTC"))); - // Integer overflow during timestamp parsing. - EXPECT_THROW( - fromTimestampString("2773581570-01-01 00:00:00-asd"), VeloxUserError); - EXPECT_THROW( - fromTimestampString("-2147483648-01-01 00:00:00-asd"), VeloxUserError); + EXPECT_EQ( + fromTimestampWithTimezoneString( + "1970-01-01 00:00:01 America/Los_Angeles"), + std::make_pair( + Timestamp(1, 0), util::getTimeZoneID("America/Los_Angeles"))); } TEST(DateTimeUtilTest, toGMT) {