diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 486ea385b6ca..770ac7f0090f 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -139,6 +139,38 @@ These functions support TIMESTAMP and DATE input types. SELECT quarter('2009-07-30'); -- 3 +.. spark:function:: make_timestamp(year, month, day, hour, minute, second[, timezone]) -> timestamp + + Create timestamp from ``year``, ``month``, ``day``, ``hour``, ``minute`` and ``second`` fields. + If the ``timezone`` parameter is provided, + the function interprets the input time components as being in the specified ``timezone``. + Otherwise the function assumes the inputs are in the session's configured time zone. + Requires ``session_timezone`` to be set, or an exceptions will be thrown. + + Arguments: + * year - the year to represent, within the Joda datetime + * month - the month-of-year to represent, from 1 (January) to 12 (December) + * day - the day-of-month to represent, from 1 to 31 + * hour - the hour-of-day to represent, from 0 to 23 + * minute - the minute-of-hour to represent, from 0 to 59 + * second - the second-of-minute and its micro-fraction to represent, from 0 to 60. + The value can be either an integer like 13, or a fraction like 13.123. + The fractional part can have up to 6 digits to represent microseconds. + If the sec argument equals to 60, the seconds field is set + to 0 and 1 minute is added to the final timestamp. + * timezone - the time zone identifier. For example, CET, UTC and etc. + + Returns the timestamp adjusted to the GMT time zone. + Returns NULL for invalid or NULL input. :: + + SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887); -- 2014-12-28 06:30:45.887 + SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET'); -- 2014-12-28 05:30:45.887 + SELECT make_timestamp(2019, 6, 30, 23, 59, 60); -- 2019-07-01 00:00:00 + SELECT make_timestamp(2019, 6, 30, 23, 59, 1); -- 2019-06-30 23:59:01 + SELECT make_timestamp(null, 7, 22, 15, 30, 0); -- NULL + SELECT make_timestamp(2014, 12, 28, 6, 30, 60.000001); -- NULL + SELECT make_timestamp(2014, 13, 28, 6, 30, 45.887); -- NULL + .. spark:function:: month(date) -> integer Returns the month of ``date``. :: diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt index 0522cbfefab5..6ba9c03c6c79 100644 --- a/velox/functions/sparksql/CMakeLists.txt +++ b/velox/functions/sparksql/CMakeLists.txt @@ -23,6 +23,7 @@ add_library( Hash.cpp In.cpp LeastGreatest.cpp + MakeTimestamp.cpp Map.cpp RegexFunctions.cpp Register.cpp diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 8a890f764545..3765d4b5ff93 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -14,8 +14,13 @@ * limitations under the License. */ +#pragma once + +#include + #include "velox/functions/lib/DateTimeFormatter.h" #include "velox/functions/lib/TimeUtils.h" +#include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::functions::sparksql { diff --git a/velox/functions/sparksql/MakeTimestamp.cpp b/velox/functions/sparksql/MakeTimestamp.cpp new file mode 100644 index 000000000000..4466482195a1 --- /dev/null +++ b/velox/functions/sparksql/MakeTimestamp.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/expression/DecodedArgs.h" +#include "velox/expression/VectorFunction.h" +#include "velox/type/tz/TimeZoneMap.h" + +namespace facebook::velox::functions::sparksql { +namespace { + +std::optional makeTimeStampFromDecodedArgs( + vector_size_t row, + DecodedVector* yearVector, + DecodedVector* monthVector, + DecodedVector* dayVector, + DecodedVector* hourVector, + DecodedVector* minuteVector, + DecodedVector* microsVector) { + // Check hour. + auto hour = hourVector->valueAt(row); + if (hour < 0 || hour > 24) { + return std::nullopt; + } + // Check minute. + auto minute = minuteVector->valueAt(row); + if (minute < 0 || minute > 60) { + return std::nullopt; + } + // Check microseconds. + auto micros = microsVector->valueAt(row); + if (micros < 0) { + return std::nullopt; + } + auto seconds = micros / util::kMicrosPerSec; + if (seconds > 60 || (seconds == 60 && micros % util::kMicrosPerSec != 0)) { + return std::nullopt; + } + + // Year, month, day will be checked in utils::daysSinceEpochFromDate. + int64_t daysSinceEpoch; + auto status = util::daysSinceEpochFromDate( + yearVector->valueAt(row), + monthVector->valueAt(row), + dayVector->valueAt(row), + daysSinceEpoch); + if (!status.ok()) { + VELOX_DCHECK(status.isUserError()); + return std::nullopt; + } + + // Micros has at most 8 digits (2 for seconds + 6 for microseconds), + // thus it's safe to cast micros from int64_t to int32_t. + auto localMicros = util::fromTime(hour, minute, 0, (int32_t)micros); + return util::fromDatetime(daysSinceEpoch, localMicros); +} + +void setTimestampOrNull( + int32_t row, + std::optional timestamp, + DecodedVector* timeZoneVector, + FlatVector* result) { + if (timestamp.has_value()) { + auto timeZone = timeZoneVector->valueAt(row); + auto tzID = util::getTimeZoneID(std::string_view(timeZone)); + (*timestamp).toGMT(tzID); + result->set(row, *timestamp); + } else { + result->setNull(row, true); + } +} + +void setTimestampOrNull( + int32_t row, + std::optional timestamp, + int64_t tzID, + FlatVector* result) { + if (timestamp.has_value()) { + (*timestamp).toGMT(tzID); + result->set(row, *timestamp); + } else { + result->setNull(row, true); + } +} + +class MakeTimestampFunction : public exec::VectorFunction { + public: + MakeTimestampFunction(int64_t sessionTzID) : sessionTzID_(sessionTzID) {} + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& outputType, + exec::EvalCtx& context, + VectorPtr& result) const override { + context.ensureWritable(rows, TIMESTAMP(), result); + auto* resultFlatVector = result->as>(); + + exec::DecodedArgs decodedArgs(rows, args, context); + auto* year = decodedArgs.at(0); + auto* month = decodedArgs.at(1); + auto* day = decodedArgs.at(2); + auto* hour = decodedArgs.at(3); + auto* minute = decodedArgs.at(4); + auto* micros = decodedArgs.at(5); + + if (args.size() == 7) { + // If the timezone argument is specified, treat the input timestamp as the + // time in that timezone. + if (args[6]->isConstantEncoding()) { + auto tz = + args[6]->asUnchecked>()->valueAt(0); + auto constantTzID = util::getTimeZoneID(std::string_view(tz)); + rows.applyToSelected([&](vector_size_t row) { + auto timestamp = makeTimeStampFromDecodedArgs( + row, year, month, day, hour, minute, micros); + setTimestampOrNull(row, timestamp, constantTzID, resultFlatVector); + }); + } else { + auto* timeZone = decodedArgs.at(6); + rows.applyToSelected([&](vector_size_t row) { + auto timestamp = makeTimeStampFromDecodedArgs( + row, year, month, day, hour, minute, micros); + setTimestampOrNull(row, timestamp, timeZone, resultFlatVector); + }); + } + } else { + // Otherwise use session timezone. + rows.applyToSelected([&](vector_size_t row) { + auto timestamp = makeTimeStampFromDecodedArgs( + row, year, month, day, hour, minute, micros); + setTimestampOrNull(row, timestamp, sessionTzID_, resultFlatVector); + }); + } + } + + static std::vector> signatures() { + return { + exec::FunctionSignatureBuilder() + .integerVariable("precision") + .returnType("timestamp") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("decimal(precision, 6)") + .build(), + exec::FunctionSignatureBuilder() + .integerVariable("precision") + .returnType("timestamp") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("integer") + .argumentType("decimal(precision, 6)") + .argumentType("varchar") + .build(), + }; + } + + private: + const int64_t sessionTzID_; +}; + +std::shared_ptr createMakeTimestampFunction( + const std::string& /* name */, + const std::vector& inputArgs, + const core::QueryConfig& config) { + const auto sessionTzName = config.sessionTimezone(); + VELOX_USER_CHECK( + !sessionTzName.empty(), + "make_timestamp requires session time zone to be set.") + const auto sessionTzID = util::getTimeZoneID(sessionTzName); + + const auto& secondsType = inputArgs[5].type; + VELOX_USER_CHECK( + secondsType->isShortDecimal(), + "Seconds must be short decimal type but got {}", + secondsType->toString()); + auto secondsScale = secondsType->asShortDecimal().scale(); + VELOX_USER_CHECK_EQ( + secondsScale, + 6, + "Seconds fraction must have 6 digits for microseconds but got {}", + secondsScale); + + return std::make_shared(sessionTzID); +} +} // namespace + +VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( + udf_make_timestamp, + MakeTimestampFunction::signatures(), + createMakeTimestampFunction); + +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp index 8c2f02a34d11..4b4f7e8d6160 100644 --- a/velox/functions/sparksql/Register.cpp +++ b/velox/functions/sparksql/Register.cpp @@ -327,6 +327,8 @@ void registerFunctions(const std::string& prefix) { registerFunction({prefix + "second"}); + VELOX_REGISTER_VECTOR_FUNCTION(udf_make_timestamp, prefix + "make_timestamp"); + // Register bloom filter function registerFunction( {prefix + "might_contain"}); diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt index 41283ce1de44..db94c796f764 100644 --- a/velox/functions/sparksql/tests/CMakeLists.txt +++ b/velox/functions/sparksql/tests/CMakeLists.txt @@ -26,6 +26,7 @@ add_executable( DecimalRoundTest.cpp DecimalUtilTest.cpp MakeDecimalTest.cpp + MakeTimestampTest.cpp ElementAtTest.cpp HashTest.cpp InTest.cpp diff --git a/velox/functions/sparksql/tests/MakeTimestampTest.cpp b/velox/functions/sparksql/tests/MakeTimestampTest.cpp new file mode 100644 index 000000000000..e12a8a728b3b --- /dev/null +++ b/velox/functions/sparksql/tests/MakeTimestampTest.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" +#include "velox/type/tz/TimeZoneMap.h" + +namespace facebook::velox::functions::sparksql::test { +namespace { + +class MakeTimestampTest : public SparkFunctionBaseTest { + protected: + void setQueryTimeZone(const std::string& timeZone) { + queryCtx_->testingOverrideConfigUnsafe({ + {core::QueryConfig::kSessionTimezone, timeZone}, + {core::QueryConfig::kAdjustTimestampToTimezone, "true"}, + }); + } +}; + +TEST_F(MakeTimestampTest, basic) { + const auto microsType = DECIMAL(16, 6); + const auto testMakeTimestamp = [&](const RowVectorPtr& data, + const VectorPtr& expected, + bool hasTimeZone) { + auto result = hasTimeZone + ? evaluate("make_timestamp(c0, c1, c2, c3, c4, c5, c6)", data) + : evaluate("make_timestamp(c0, c1, c2, c3, c4, c5)", data); + facebook::velox::test::assertEqualVectors(expected, result); + }; + const auto testConstantTimezone = [&](const RowVectorPtr& data, + const std::string& timezone, + const VectorPtr& expected) { + auto result = evaluate( + fmt::format("make_timestamp(c0, c1, c2, c3, c4, c5, '{}')", timezone), + data); + facebook::velox::test::assertEqualVectors(expected, result); + }; + + // Valid cases w/o time zone argument. + { + const auto year = makeFlatVector({2021, 2021, 2021, 2021, 2021}); + const auto month = makeFlatVector({7, 7, 7, 7, 7}); + const auto day = makeFlatVector({11, 11, 11, 11, 11}); + const auto hour = makeFlatVector({6, 6, 6, 6, 6}); + const auto minute = makeFlatVector({30, 30, 30, 30, 30}); + const auto micros = makeNullableFlatVector( + {45678000, 1e6, 6e7, 59999999, std::nullopt}, microsType); + auto data = makeRowVector({year, month, day, hour, minute, micros}); + + setQueryTimeZone("GMT"); + auto expectedGMT = makeNullableFlatVector( + {util::fromTimestampString("2021-07-11 06:30:45.678"), + util::fromTimestampString("2021-07-11 06:30:01"), + util::fromTimestampString("2021-07-11 06:31:00"), + util::fromTimestampString("2021-07-11 06:30:59.999999"), + std::nullopt}); + testMakeTimestamp(data, expectedGMT, false); + testConstantTimezone(data, "GMT", expectedGMT); + + setQueryTimeZone("Asia/Shanghai"); + auto expectedSessionTimezone = makeNullableFlatVector( + {util::fromTimestampString("2021-07-10 22:30:45.678"), + util::fromTimestampString("2021-07-10 22:30:01"), + util::fromTimestampString("2021-07-10 22:31:00"), + util::fromTimestampString("2021-07-10 22:30:59.999999"), + std::nullopt}); + testMakeTimestamp(data, expectedSessionTimezone, false); + // Session time zone will be ignored if time zone is specified in argument. + testConstantTimezone(data, "GMT", expectedGMT); + } + + // Valid cases w/ time zone argument. + { + setQueryTimeZone("Asia/Shanghai"); + const auto year = makeFlatVector({2021, 2021, 1}); + const auto month = makeFlatVector({07, 07, 1}); + const auto day = makeFlatVector({11, 11, 1}); + const auto hour = makeFlatVector({6, 6, 1}); + const auto minute = makeFlatVector({30, 30, 1}); + const auto micros = + makeNullableFlatVector({45678000, 45678000, 1e6}, microsType); + const auto timeZone = + makeNullableFlatVector({"GMT", "CET", std::nullopt}); + auto data = + makeRowVector({year, month, day, hour, minute, micros, timeZone}); + // Session time zone will be ignored if time zone is specified in argument. + auto expected = makeNullableFlatVector( + {util::fromTimestampString("2021-07-11 06:30:45.678"), + util::fromTimestampString("2021-07-11 04:30:45.678"), + std::nullopt}); + testMakeTimestamp(data, expected, true); + } +} + +TEST_F(MakeTimestampTest, errors) { + const auto microsType = DECIMAL(16, 6); + const auto testInvalidInputs = [&](const RowVectorPtr& data) { + std::vector> nullResults( + data->size(), std::nullopt); + auto expected = makeNullableFlatVector(nullResults); + auto result = evaluate("make_timestamp(c0, c1, c2, c3, c4, c5)", data); + facebook::velox::test::assertEqualVectors(expected, result); + }; + const auto testInvalidSeconds = [&](std::optional microsec) { + auto result = evaluateOnce( + "make_timestamp(c0, c1, c2, c3, c4, c5)", + {1, 1, 1, 1, 1, microsec}, + {INTEGER(), INTEGER(), INTEGER(), INTEGER(), INTEGER(), microsType}); + EXPECT_EQ(result, std::nullopt); + }; + const auto testInvalidArguments = [&](int64_t microsec, + const TypePtr& microsType) { + return evaluateOnce( + "make_timestamp(c0, c1, c2, c3, c4, c5)", + {1, 1, 1, 1, 1, microsec}, + {INTEGER(), INTEGER(), INTEGER(), INTEGER(), INTEGER(), microsType}); + }; + + setQueryTimeZone("Asia/Shanghai"); + // Invalid input returns null. + const auto year = makeFlatVector( + {facebook::velox::util::kMinYear - 1, + facebook::velox::util::kMaxYear + 1, + 1, + 1, + 1, + 1, + 1, + 1}); + const auto month = makeFlatVector({1, 1, 0, 13, 1, 1, 1, 1}); + const auto day = makeFlatVector({1, 1, 1, 1, 0, 32, 1, 1}); + const auto hour = makeFlatVector({1, 1, 1, 1, 1, 1, 25, 1}); + const auto minute = makeFlatVector({1, 1, 1, 1, 1, 1, 1, 61}); + const auto micros = + makeFlatVector({1, 1, 1, 1, 1, 1, 1, 1}, microsType); + auto data = makeRowVector({year, month, day, hour, minute, micros}); + testInvalidInputs(data); + + // Seconds should be either in the range of [0,59], or 60 with zero + // microseconds. + testInvalidSeconds(61e6); + testInvalidSeconds(99999999); + testInvalidSeconds(999999999); + testInvalidSeconds(60007000); + + // Throw if data type for microseconds is invalid. + VELOX_ASSERT_THROW( + testInvalidArguments(1e6, DECIMAL(20, 6)), + "Seconds must be short decimal type but got DECIMAL(20, 6)"); + VELOX_ASSERT_THROW( + testInvalidArguments(1e6, DECIMAL(16, 8)), + "Scalar function signature is not supported: " + "make_timestamp(INTEGER, INTEGER, INTEGER, INTEGER, INTEGER, " + "DECIMAL(16, 8))."); + // Throw if no session time zone. + setQueryTimeZone(""); + VELOX_ASSERT_THROW( + testInvalidArguments(60007000, DECIMAL(16, 6)), + "make_timestamp requires session time zone to be set."); +} + +} // namespace +} // namespace facebook::velox::functions::sparksql::test