From 1ab04667caacebc9c212fbef4f51a245dc960eff Mon Sep 17 00:00:00 2001 From: yangchuan Date: Fri, 19 Jan 2024 05:55:05 -0800 Subject: [PATCH] Add hex Spark function (#8202) Summary: Presto's to_hex only accepts varbinary as its input type, [doc](https://prestodb.io/docs/current/functions/binary.html#to_hex) Spark's hex accepts varbinary, varchar and bigint as input types, [doc](https://spark.apache.org/docs/latest/api/sql/#hex), implementation details: https://github.com/apache/spark/blob/28da1d853477b306774798d8aa738901221fb804/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L1032-L1055 This patch adds support for Spark's hex function with these input types. Pull Request resolved: https://github.com/facebookincubator/velox/pull/8202 Reviewed By: pedroerp Differential Revision: D52865112 Pulled By: mbasmanova fbshipit-source-id: ae4934ee5de9c5031af0bae317c5e71493a9e53a --- velox/docs/functions/spark/math.rst | 13 ++++ velox/functions/lib/ToHex.h | 72 +++++++++++++++++++ velox/functions/prestosql/BinaryFunctions.h | 27 +------ velox/functions/sparksql/Arithmetic.h | 34 +++++++++ .../functions/sparksql/RegisterArithmetic.cpp | 4 ++ .../sparksql/tests/ArithmeticTest.cpp | 30 ++++++++ 6 files changed, 155 insertions(+), 25 deletions(-) create mode 100644 velox/functions/lib/ToHex.h diff --git a/velox/docs/functions/spark/math.rst b/velox/docs/functions/spark/math.rst index db5da68b3ac8..af05926a75e9 100644 --- a/velox/docs/functions/spark/math.rst +++ b/velox/docs/functions/spark/math.rst @@ -101,6 +101,19 @@ Mathematical Functions Returns ``x`` rounded down to the nearest integer. Supported types are: BIGINT and DOUBLE. +.. spark:function:: hex(x) -> varchar + + Converts ``x`` to hexadecimal. + Supported types are: BIGINT, VARBINARY and VARCHAR. + If the argument is a VARCHAR or VARBINARY, the result is a string where each input byte is represented using 2 hex characters. 
+ If the argument is a positive BIGINT, the result is the hex representation of the number (up to 16 characters). + If the argument is a negative BIGINT, the result is the hex representation of its 64-bit two's complement value (always 16 characters). :: + + SELECT hex("Spark SQL"); -- 537061726B2053514C + SELECT hex(17); -- 11 + SELECT hex(-1); -- FFFFFFFFFFFFFFFF + + .. spark:function:: hypot(a, b) -> double Returns the square root of `a` squared plus `b` squared. diff --git a/velox/functions/lib/ToHex.h b/velox/functions/lib/ToHex.h new file mode 100644 index 000000000000..14b714f81157 --- /dev/null +++ b/velox/functions/lib/ToHex.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/expression/StringWriter.h" + +namespace facebook::velox::functions { + +struct ToHexUtil { + FOLLY_ALWAYS_INLINE static void toHex( + StringView input, + exec::StringWriter& result) { + // Lookup table: the two upper-case hex digits of byte value b are kHexTable[2 * b] and kHexTable[2 * b + 1]. 
+ static const char* const kHexTable = + "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F" + "202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F" + "404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F" + "606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F" + "808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F" + "A0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF" + "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF" + "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"; + + const int64_t inputSize = input.size(); + const unsigned char* inputBuffer = + reinterpret_cast<const unsigned char*>(input.data()); + result.resize(inputSize * 2); + char* resultBuffer = result.data(); + + for (int64_t i = 0; i < inputSize; ++i) { + resultBuffer[i * 2] = kHexTable[inputBuffer[i] * 2]; + resultBuffer[i * 2 + 1] = kHexTable[inputBuffer[i] * 2 + 1]; + } + } + + FOLLY_ALWAYS_INLINE static void toHex( + uint64_t input, + exec::StringWriter& result) { + static const char* const kHexTable = "0123456789ABCDEF"; + if (input == 0) { + result = "0"; + return; + } + + const auto resultSize = ((64 - bits::countLeadingZeros(input)) + 3) / 4; + result.resize(resultSize); + char* buffer = result.data(); + + int32_t len = 0; + do { + len += 1; + buffer[resultSize - len] = kHexTable[input & 0xF]; + input >>= 4; + } while (input != 0); + } +}; + +} // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h index 35648c892123..1f945a1609de 100644 --- a/velox/functions/prestosql/BinaryFunctions.h +++ b/velox/functions/prestosql/BinaryFunctions.h @@ -27,6 +27,7 @@ #include "velox/common/encode/Base64.h" #include "velox/external/md5/md5.h" #include "velox/functions/Udf.h" +#include "velox/functions/lib/ToHex.h" namespace facebook::velox::functions { @@ -218,10 +219,6 @@ struct HmacMd5Function { } }; -FOLLY_ALWAYS_INLINE 
unsigned char toHex(unsigned char c) { - return c < 10 ? (c + '0') : (c + 'A' - 10); -} - template struct ToHexFunction { VELOX_DEFINE_FUNCTION_TYPES(T); @@ -229,27 +226,7 @@ struct ToHexFunction { FOLLY_ALWAYS_INLINE void call( out_type& result, const arg_type& input) { - static const char* const kHexTable = - "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F" - "202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F" - "404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F" - "606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F" - "808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F" - "A0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF" - "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF" - "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"; - - const auto inputSize = input.size(); - result.resize(inputSize * 2); - - const unsigned char* inputBuffer = - reinterpret_cast(input.data()); - char* resultBuffer = result.data(); - - for (auto i = 0; i < inputSize; ++i) { - resultBuffer[i * 2] = kHexTable[inputBuffer[i] * 2]; - resultBuffer[i * 2 + 1] = kHexTable[inputBuffer[i] * 2 + 1]; - } + ToHexUtil::toHex(input, result); } }; diff --git a/velox/functions/sparksql/Arithmetic.h b/velox/functions/sparksql/Arithmetic.h index 6b1c74f7ce6c..f01153c7ee22 100644 --- a/velox/functions/sparksql/Arithmetic.h +++ b/velox/functions/sparksql/Arithmetic.h @@ -22,6 +22,7 @@ #include #include "velox/functions/Macros.h" +#include "velox/functions/lib/ToHex.h" namespace facebook::velox::functions::sparksql { @@ -318,4 +319,37 @@ struct IsNanFunction { } } }; + +template +struct ToHexVarbinaryFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call( + out_type& result, + const arg_type& input) { + ToHexUtil::toHex(input, result); + } +}; + +template +struct ToHexVarcharFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call( + 
out_type& result, + const arg_type& input) { + ToHexUtil::toHex(input, result); + } +}; + +template +struct ToHexBigintFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call( + out_type& result, + const arg_type& input) { + ToHexUtil::toHex(input, result); + } +}; } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/RegisterArithmetic.cpp b/velox/functions/sparksql/RegisterArithmetic.cpp index 08fb54e636fd..5aa85c778878 100644 --- a/velox/functions/sparksql/RegisterArithmetic.cpp +++ b/velox/functions/sparksql/RegisterArithmetic.cpp @@ -66,6 +66,10 @@ void registerArithmeticFunctions(const std::string& prefix) { registerFunction({prefix + "atan2"}); registerFunction({prefix + "log1p"}); registerFunction({prefix + "bin"}); + registerFunction({prefix + "hex"}); + registerFunction({prefix + "hex"}); + registerFunction( + {prefix + "hex"}); registerFunction({prefix + "exp"}); registerBinaryIntegral({prefix + "pmod"}); registerBinaryFloatingPoint({prefix + "pmod"}); diff --git a/velox/functions/sparksql/tests/ArithmeticTest.cpp b/velox/functions/sparksql/tests/ArithmeticTest.cpp index 54317c4505e5..0ed06df554ce 100644 --- a/velox/functions/sparksql/tests/ArithmeticTest.cpp +++ b/velox/functions/sparksql/tests/ArithmeticTest.cpp @@ -417,6 +417,36 @@ TEST_F(ArithmeticTest, isNanDouble) { EXPECT_EQ(false, isNan(std::nullopt)); } +TEST_F(ArithmeticTest, hexWithBigint) { + const auto toHex = [&](std::optional value) { + return evaluateOnce("hex(c0)", value); + }; + EXPECT_EQ("11", toHex(17)); + EXPECT_EQ("FFFFFFFFFFFFFFEF", toHex(-17)); + EXPECT_EQ("0", toHex(0)); + EXPECT_EQ("FFFFFFFFFFFFFFFF", toHex(-1)); + EXPECT_EQ("7FFFFFFFFFFFFFFF", toHex(INT64_MAX)); + EXPECT_EQ("8000000000000000", toHex(INT64_MIN)); +} + +TEST_F(ArithmeticTest, hexWithVarbinaryAndVarchar) { + const auto toHex = [&](std::optional value) { + auto varbinaryResult = + evaluateOnce("hex(cast(c0 as varbinary))", value); + auto varcharResult = 
evaluateOnce("hex(c0)", value); + + EXPECT_TRUE(varbinaryResult.has_value()); + EXPECT_TRUE(varcharResult.has_value()); + EXPECT_EQ(varbinaryResult.value(), varcharResult.value()); + + return varcharResult.value(); + }; + ASSERT_EQ(toHex(""), ""); + ASSERT_EQ(toHex("Spark SQL"), "537061726B2053514C"); + ASSERT_EQ(toHex("Spark\x65\x21SQL"), "537061726B652153514C"); + ASSERT_EQ(toHex("Spark\u6570\u636ESQL"), "537061726BE695B0E68DAE53514C"); +} + class LogNTest : public SparkFunctionBaseTest { protected: static constexpr float kInf = std::numeric_limits::infinity();