From ee50d7eb91a72e0fcc2bd4f151bc19bf385c4eb8 Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Wed, 13 Mar 2024 02:27:50 -0700 Subject: [PATCH] Add make_timestamp Spark function (#8812) Summary: Add sparksql function `make_timestamp` for non-ansi behavior, which creates a timestamp from year, month, day, hour, min, sec and timezone (optional) fields. The timezone field indicates the timezone of the input timestamp. If not specified, the input timestamp is treated as the time in the session timezone. The output datatype is timestamp type, which internally stores the number of microseconds from the epoch of `1970-01-01T00:00:00.000000Z (UTC+00:00)` In spark, the result is shown as the time in the session timezone(`spark.sql.session.timeZone`). ``` set spark.sql.session.timeZone=Asia/Shanghai; SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887); -- 2014-12-28 06:30:45.887 SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET'); -- 2014-12-28 13:30:45.887 ``` In Velox, it should return the timestamp in UTC timezone, so that the result can be correctly converted by Spark for display. The non-ansi behavior returns NULL for invalid inputs. Spark documentation: https://spark.apache.org/docs/latest/api/sql/#make_timestamp Spark implementation: https://github.com/apache/spark/blob/branch-3.3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala#L2512-L2712 Pull Request resolved: https://github.com/facebookincubator/velox/pull/8812 Reviewed By: amitkdutta Differential Revision: D54788353 Pulled By: mbasmanova fbshipit-source-id: bf28991c4373345876459ab4781eecb90ba30519 --- velox/docs/functions/spark/datetime.rst | 32 +++ velox/functions/sparksql/CMakeLists.txt | 1 + velox/functions/sparksql/DateTimeFunctions.h | 5 + velox/functions/sparksql/MakeTimestamp.cpp | 210 ++++++++++++++++++ velox/functions/sparksql/Register.cpp | 2 + velox/functions/sparksql/tests/CMakeLists.txt | 1 + .../sparksql/tests/MakeTimestampTest.cpp | 177 +++++++++++++++ 7 files changed, 428 insertions(+) create mode 100644 velox/functions/sparksql/MakeTimestamp.cpp create mode 100644 velox/functions/sparksql/tests/MakeTimestampTest.cpp diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 486ea385b6ca..770ac7f0090f 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -139,6 +139,38 @@ These functions support TIMESTAMP and DATE input types. SELECT quarter('2009-07-30'); -- 3 +.. spark:function:: make_timestamp(year, month, day, hour, minute, second[, timezone]) -> timestamp + + Create timestamp from ``year``, ``month``, ``day``, ``hour``, ``minute`` and ``second`` fields. + If the ``timezone`` parameter is provided, + the function interprets the input time components as being in the specified ``timezone``. + Otherwise the function assumes the inputs are in the session's configured time zone. + Requires ``session_timezone`` to be set, or an exceptions will be thrown. + + Arguments: + * year - the year to represent, within the Joda datetime + * month - the month-of-year to represent, from 1 (January) to 12 (December) + * day - the day-of-month to represent, from 1 to 31 + * hour - the hour-of-day to represent, from 0 to 23 + * minute - the minute-of-hour to represent, from 0 to 59 + * second - the second-of-minute and its micro-fraction to represent, from 0 to 60. + The value can be either an integer like 13, or a fraction like 13.123. + The fractional part can have up to 6 digits to represent microseconds. + If the sec argument equals to 60, the seconds field is set + to 0 and 1 minute is added to the final timestamp. + * timezone - the time zone identifier. For example, CET, UTC and etc. + + Returns the timestamp adjusted to the GMT time zone. + Returns NULL for invalid or NULL input. :: + + SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887); -- 2014-12-28 06:30:45.887 + SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET'); -- 2014-12-28 05:30:45.887 + SELECT make_timestamp(2019, 6, 30, 23, 59, 60); -- 2019-07-01 00:00:00 + SELECT make_timestamp(2019, 6, 30, 23, 59, 1); -- 2019-06-30 23:59:01 + SELECT make_timestamp(null, 7, 22, 15, 30, 0); -- NULL + SELECT make_timestamp(2014, 12, 28, 6, 30, 60.000001); -- NULL + SELECT make_timestamp(2014, 13, 28, 6, 30, 45.887); -- NULL + .. spark:function:: month(date) -> integer Returns the month of ``date``. :: diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt index 0522cbfefab5..6ba9c03c6c79 100644 --- a/velox/functions/sparksql/CMakeLists.txt +++ b/velox/functions/sparksql/CMakeLists.txt @@ -23,6 +23,7 @@ add_library( Hash.cpp In.cpp LeastGreatest.cpp + MakeTimestamp.cpp Map.cpp RegexFunctions.cpp Register.cpp diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 8a890f764545..3765d4b5ff93 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -14,8 +14,13 @@ * limitations under the License. */ +#pragma once + +#include + #include "velox/functions/lib/DateTimeFormatter.h" #include "velox/functions/lib/TimeUtils.h" +#include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::functions::sparksql { diff --git a/velox/functions/sparksql/MakeTimestamp.cpp b/velox/functions/sparksql/MakeTimestamp.cpp new file mode 100644 index 000000000000..4466482195a1 --- /dev/null +++ b/velox/functions/sparksql/MakeTimestamp.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/expression/DecodedArgs.h" +#include "velox/expression/VectorFunction.h" +#include "velox/type/tz/TimeZoneMap.h" + +namespace facebook::velox::functions::sparksql { +namespace { + +std::optional makeTimeStampFromDecodedArgs( + vector_size_t row, + DecodedVector* yearVector, + DecodedVector* monthVector, + DecodedVector* dayVector, + DecodedVector* hourVector, + DecodedVector* minuteVector, + DecodedVector* microsVector) { + // Check hour. + auto hour = hourVector->valueAt(row); + if (hour < 0 || hour > 24) { + return std::nullopt; + } + // Check minute. + auto minute = minuteVector->valueAt(row); + if (minute < 0 || minute > 60) { + return std::nullopt; + } + // Check microseconds. + auto micros = microsVector->valueAt(row); + if (micros < 0) { + return std::nullopt; + } + auto seconds = micros / util::kMicrosPerSec; + if (seconds > 60 || (seconds == 60 && micros % util::kMicrosPerSec != 0)) { + return std::nullopt; + } + + // Year, month, day will be checked in utils::daysSinceEpochFromDate. + int64_t daysSinceEpoch; + auto status = util::daysSinceEpochFromDate( + yearVector->valueAt(row), + monthVector->valueAt(row), + dayVector->valueAt(row), + daysSinceEpoch); + if (!status.ok()) { + VELOX_DCHECK(status.isUserError()); + return std::nullopt; + } + + // Micros has at most 8 digits (2 for seconds + 6 for microseconds), + // thus it's safe to cast micros from int64_t to int32_t. + auto localMicros = util::fromTime(hour, minute, 0, (int32_t)micros); + return util::fromDatetime(daysSinceEpoch, localMicros); +} + +void setTimestampOrNull( + int32_t row, + std::optional timestamp, + DecodedVector* timeZoneVector, + FlatVector* result) { + if (timestamp.has_value()) { + auto timeZone = timeZoneVector->valueAt(row); + auto tzID = util::getTimeZoneID(std::string_view(timeZone)); + (*timestamp).toGMT(tzID); + result->set(row, *timestamp); + } else { + result->setNull(row, true); + } +} + +void setTimestampOrNull( + int32_t row, + std::optional timestamp, + int64_t tzID, + FlatVector* result) { + if (timestamp.has_value()) { + (*timestamp).toGMT(tzID); + result->set(row, *timestamp); + } else { + result->setNull(row, true); + } +} + +class MakeTimestampFunction : public exec::VectorFunction { + public: + MakeTimestampFunction(int64_t sessionTzID) : sessionTzID_(sessionTzID) {} + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& outputType, + exec::EvalCtx& context, + VectorPtr& result) const override { + context.ensureWritable(rows, TIMESTAMP(), result); + auto* resultFlatVector = result->as>(); + + exec::DecodedArgs decodedArgs(rows, args, context); + auto* year = decodedArgs.at(0); + auto* month = decodedArgs.at(1); + auto* day = decodedArgs.at(2); + auto* hour = decodedArgs.at(3); + auto* minute = decodedArgs.at(4); + auto* micros = decodedArgs.at(5); + + if (args.size() == 7) { + // If the timezone argument is specified, treat the input timestamp as the + // time in that timezone. + if (args[6]->isConstantEncoding()) { + auto tz = + args[6]->asUnchecked>()->valueAt(0); + auto constantTzID = util::getTimeZoneID(std::string_view(tz)); + rows.applyToSelected([&](vector_size_t row) { + auto timestamp = makeTimeStampFromDecodedArgs( + row, year, month, day, hour, minute, micros); + setTimestampOrNull(row, timestamp, constantTzID, resultFlatVector); + }); + } else { + auto* timeZone = decodedArgs.at(6); + rows.applyToSelected([&](vector_size_t row) { + auto timestamp = makeTimeStampFromDecodedArgs( + row, year, month, day, hour, minute, micros); + setTimestampOrNull(row, timestamp, timeZone, resultFlatVector); + }); + } + } else { + // Otherwise use session timezone. + rows.applyToSelected([&](vector_size_t row) { + auto timestamp = makeTimeStampFromDecodedArgs( + row, year, month, day, hour, minute, micros); + setTimestampOrNull(row, timestamp, sessionTzID_, resultFlatVector); + }); + } + } + + static std::vector> signatures() { + return { + exec::FunctionSignatureBuilder() + .integerVariable("precision") + .returnType("timestamp") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("decimal(precision, 6)") + .build(), + exec::FunctionSignatureBuilder() + .integerVariable("precision") + .returnType("timestamp") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("decimal(precision, 6)") + .argumentType("varchar") + .build(), + }; + } + + private: + const int64_t sessionTzID_; +}; + +std::shared_ptr createMakeTimestampFunction( + const std::string& /* name */, + const std::vector& inputArgs, + const core::QueryConfig& config) { + const auto sessionTzName = config.sessionTimezone(); + VELOX_USER_CHECK( + !sessionTzName.empty(), + "make_timestamp requires session time zone to be set.") + const auto sessionTzID = util::getTimeZoneID(sessionTzName); + + const auto& secondsType = inputArgs[5].type; + VELOX_USER_CHECK( + secondsType->isShortDecimal(), + "Seconds must be short decimal type but got {}", + secondsType->toString()); + auto secondsScale = secondsType->asShortDecimal().scale(); + VELOX_USER_CHECK_EQ( + secondsScale, + 6, + "Seconds fraction must have 6 digits for microseconds but got {}", + secondsScale); + + return std::make_shared(sessionTzID); +} +} // namespace + +VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( + udf_make_timestamp, + MakeTimestampFunction::signatures(), + createMakeTimestampFunction); + +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp index 8c2f02a34d11..4b4f7e8d6160 100644 --- a/velox/functions/sparksql/Register.cpp +++ b/velox/functions/sparksql/Register.cpp @@ -327,6 +327,8 @@ void registerFunctions(const std::string& prefix) { registerFunction({prefix + "second"}); + VELOX_REGISTER_VECTOR_FUNCTION(udf_make_timestamp, prefix + "make_timestamp"); + // Register bloom filter function registerFunction( {prefix + "might_contain"}); diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt index 41283ce1de44..db94c796f764 100644 --- a/velox/functions/sparksql/tests/CMakeLists.txt +++ b/velox/functions/sparksql/tests/CMakeLists.txt @@ -26,6 +26,7 @@ add_executable( DecimalRoundTest.cpp DecimalUtilTest.cpp MakeDecimalTest.cpp + MakeTimestampTest.cpp ElementAtTest.cpp HashTest.cpp InTest.cpp diff --git a/velox/functions/sparksql/tests/MakeTimestampTest.cpp b/velox/functions/sparksql/tests/MakeTimestampTest.cpp new file mode 100644 index 000000000000..e12a8a728b3b --- /dev/null +++ b/velox/functions/sparksql/tests/MakeTimestampTest.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" +#include "velox/type/tz/TimeZoneMap.h" + +namespace facebook::velox::functions::sparksql::test { +namespace { + +class MakeTimestampTest : public SparkFunctionBaseTest { + protected: + void setQueryTimeZone(const std::string& timeZone) { + queryCtx_->testingOverrideConfigUnsafe({ + {core::QueryConfig::kSessionTimezone, timeZone}, + {core::QueryConfig::kAdjustTimestampToTimezone, "true"}, + }); + } +}; + +TEST_F(MakeTimestampTest, basic) { + const auto microsType = DECIMAL(16, 6); + const auto testMakeTimestamp = [&](const RowVectorPtr& data, + const VectorPtr& expected, + bool hasTimeZone) { + auto result = hasTimeZone + ? evaluate("make_timestamp(c0, c1, c2, c3, c4, c5, c6)", data) + : evaluate("make_timestamp(c0, c1, c2, c3, c4, c5)", data); + facebook::velox::test::assertEqualVectors(expected, result); + }; + const auto testConstantTimezone = [&](const RowVectorPtr& data, + const std::string& timezone, + const VectorPtr& expected) { + auto result = evaluate( + fmt::format("make_timestamp(c0, c1, c2, c3, c4, c5, '{}')", timezone), + data); + facebook::velox::test::assertEqualVectors(expected, result); + }; + + // Valid cases w/o time zone argument. + { + const auto year = makeFlatVector({2021, 2021, 2021, 2021, 2021}); + const auto month = makeFlatVector({7, 7, 7, 7, 7}); + const auto day = makeFlatVector({11, 11, 11, 11, 11}); + const auto hour = makeFlatVector({6, 6, 6, 6, 6}); + const auto minute = makeFlatVector({30, 30, 30, 30, 30}); + const auto micros = makeNullableFlatVector( + {45678000, 1e6, 6e7, 59999999, std::nullopt}, microsType); + auto data = makeRowVector({year, month, day, hour, minute, micros}); + + setQueryTimeZone("GMT"); + auto expectedGMT = makeNullableFlatVector( + {util::fromTimestampString("2021-07-11 06:30:45.678"), + util::fromTimestampString("2021-07-11 06:30:01"), + util::fromTimestampString("2021-07-11 06:31:00"), + util::fromTimestampString("2021-07-11 06:30:59.999999"), + std::nullopt}); + testMakeTimestamp(data, expectedGMT, false); + testConstantTimezone(data, "GMT", expectedGMT); + + setQueryTimeZone("Asia/Shanghai"); + auto expectedSessionTimezone = makeNullableFlatVector( + {util::fromTimestampString("2021-07-10 22:30:45.678"), + util::fromTimestampString("2021-07-10 22:30:01"), + util::fromTimestampString("2021-07-10 22:31:00"), + util::fromTimestampString("2021-07-10 22:30:59.999999"), + std::nullopt}); + testMakeTimestamp(data, expectedSessionTimezone, false); + // Session time zone will be ignored if time zone is specified in argument. + testConstantTimezone(data, "GMT", expectedGMT); + } + + // Valid cases w/ time zone argument. + { + setQueryTimeZone("Asia/Shanghai"); + const auto year = makeFlatVector({2021, 2021, 1}); + const auto month = makeFlatVector({07, 07, 1}); + const auto day = makeFlatVector({11, 11, 1}); + const auto hour = makeFlatVector({6, 6, 1}); + const auto minute = makeFlatVector({30, 30, 1}); + const auto micros = + makeNullableFlatVector({45678000, 45678000, 1e6}, microsType); + const auto timeZone = + makeNullableFlatVector({"GMT", "CET", std::nullopt}); + auto data = + makeRowVector({year, month, day, hour, minute, micros, timeZone}); + // Session time zone will be ignored if time zone is specified in argument. + auto expected = makeNullableFlatVector( + {util::fromTimestampString("2021-07-11 06:30:45.678"), + util::fromTimestampString("2021-07-11 04:30:45.678"), + std::nullopt}); + testMakeTimestamp(data, expected, true); + } +} + +TEST_F(MakeTimestampTest, errors) { + const auto microsType = DECIMAL(16, 6); + const auto testInvalidInputs = [&](const RowVectorPtr& data) { + std::vector> nullResults( + data->size(), std::nullopt); + auto expected = makeNullableFlatVector(nullResults); + auto result = evaluate("make_timestamp(c0, c1, c2, c3, c4, c5)", data); + facebook::velox::test::assertEqualVectors(expected, result); + }; + const auto testInvalidSeconds = [&](std::optional microsec) { + auto result = evaluateOnce( + "make_timestamp(c0, c1, c2, c3, c4, c5)", + {1, 1, 1, 1, 1, microsec}, + {INTEGER(), INTEGER(), INTEGER(), INTEGER(), INTEGER(), microsType}); + EXPECT_EQ(result, std::nullopt); + }; + const auto testInvalidArguments = [&](int64_t microsec, + const TypePtr& microsType) { + return evaluateOnce( + "make_timestamp(c0, c1, c2, c3, c4, c5)", + {1, 1, 1, 1, 1, microsec}, + {INTEGER(), INTEGER(), INTEGER(), INTEGER(), INTEGER(), microsType}); + }; + + setQueryTimeZone("Asia/Shanghai"); + // Invalid input returns null. + const auto year = makeFlatVector( + {facebook::velox::util::kMinYear - 1, + facebook::velox::util::kMaxYear + 1, + 1, + 1, + 1, + 1, + 1, + 1}); + const auto month = makeFlatVector({1, 1, 0, 13, 1, 1, 1, 1}); + const auto day = makeFlatVector({1, 1, 1, 1, 0, 32, 1, 1}); + const auto hour = makeFlatVector({1, 1, 1, 1, 1, 1, 25, 1}); + const auto minute = makeFlatVector({1, 1, 1, 1, 1, 1, 1, 61}); + const auto micros = + makeFlatVector({1, 1, 1, 1, 1, 1, 1, 1}, microsType); + auto data = makeRowVector({year, month, day, hour, minute, micros}); + testInvalidInputs(data); + + // Seconds should be either in the range of [0,59], or 60 with zero + // microseconds. + testInvalidSeconds(61e6); + testInvalidSeconds(99999999); + testInvalidSeconds(999999999); + testInvalidSeconds(60007000); + + // Throw if data type for microseconds is invalid. + VELOX_ASSERT_THROW( + testInvalidArguments(1e6, DECIMAL(20, 6)), + "Seconds must be short decimal type but got DECIMAL(20, 6)"); + VELOX_ASSERT_THROW( + testInvalidArguments(1e6, DECIMAL(16, 8)), + "Scalar function signature is not supported: " + "make_timestamp(INTEGER, INTEGER, INTEGER, INTEGER, INTEGER, " + "DECIMAL(16, 8))."); + // Throw if no session time zone. + setQueryTimeZone(""); + VELOX_ASSERT_THROW( + testInvalidArguments(60007000, DECIMAL(16, 6)), + "make_timestamp requires session time zone to be set."); +} + +} // namespace +} // namespace facebook::velox::functions::sparksql::test