Skip to content

Commit

Permalink
Add reverse(varbinary) Presto function (facebookincubator#8429)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#8429

Presto 'reverse' function allows 3 types of input: array, varchar and varbinary.
Before this change Velox supported array and varchar inputs, but not
varbinary.

When applied to varchar input, the function returns input string with characters
in reversed order. When applied to varbinary input, the function returns input
binary with bytes in reversed order.

Reviewed By: amitkdutta

Differential Revision: D52868694

fbshipit-source-id: 7c0da82c7b61b600e4a112659aa788e67b046042
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Jan 19, 2024
1 parent f68bd2b commit 2e081d1
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 25 deletions.
7 changes: 6 additions & 1 deletion velox/docs/functions/presto/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,12 @@ String Functions
.. function:: reverse(string) -> varchar
:noindex:

Reverses ``string``.
Returns input string with characters in reverse order.

.. function:: reverse(varbinary) -> varbinary
:noindex:

Returns input binary with bytes in reverse order.

.. function:: rpad(string, size, padstring) -> varchar

Expand Down
9 changes: 6 additions & 3 deletions velox/functions/lib/StringEncodingUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,23 @@ bool prepareFlatResultsVector(
VectorPtr& result,
const SelectivityVector& rows,
exec::EvalCtx& context,
VectorPtr& argToReuse) {
VectorPtr& argToReuse,
const TypePtr& resultType) {
VELOX_CHECK(resultType->isVarbinary() || resultType->isVarchar())

if (!result && BaseVector::isVectorWritable(argToReuse) &&
argToReuse->isFlatEncoding() &&
hasSingleReferencedBuffers(*argToReuse->asFlatVector<StringView>())) {
// Move input vector to result
VELOX_CHECK(
VectorEncoding::isFlat(argToReuse.get()->encoding()) &&
argToReuse.get()->typeKind() == TypeKind::VARCHAR);
argToReuse.get()->typeKind() == resultType->kind());

result = std::move(argToReuse);
return true;
}
// This will allocate results if not allocated
BaseVector::ensureWritable(rows, VARCHAR(), context.pool(), result);
BaseVector::ensureWritable(rows, resultType, context.pool(), result);

VELOX_CHECK(VectorEncoding::isFlat(result->encoding()));
return false;
Expand Down
5 changes: 4 additions & 1 deletion velox/functions/lib/StringEncodingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,14 @@ namespace facebook::velox::functions {
/// Helper function that prepares a string result vector and initializes it.
/// It will use the input argToReuse vector instead of creating new one when
/// possible. Returns true if the argToReuse vector was moved into result.
///
/// @param resultType VARCHAR() or VARBINARY().
bool prepareFlatResultsVector(
VectorPtr& result,
const SelectivityVector& rows,
exec::EvalCtx& context,
VectorPtr& argToReuse);
VectorPtr& argToReuse,
const TypePtr& resultType = VARCHAR());

/// Return the string encoding of a vector, if not set UTF8 is returned
static bool isAscii(BaseVector* vector, const SelectivityVector& rows) {
Expand Down
60 changes: 40 additions & 20 deletions velox/functions/prestosql/Reverse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ namespace facebook::velox::functions {
/// Takes any array as an input and returns the reversed array.
///
/// reverse(Varchar) -> Varchar
/// Takes any Varchar as an input and returns the reversed varchar.
/// Takes any string as an input and returns a string with characters in
/// reverse order.
///
/// reverse(Varbinary) -> Varbinary
/// Takes any binary as an input and returns a binary with bytes in reverse
/// order.
class ReverseFunction : public exec::VectorFunction {
private:
/// String encoding wrappable function
Expand All @@ -35,9 +40,9 @@ class ReverseFunction : public exec::VectorFunction {
static void apply(
const SelectivityVector& rows,
const FlatVector<StringView>* input,
FlatVector<StringView>* results) {
FlatVector<StringView>* result) {
rows.applyToSelected([&](int row) {
auto proxy = exec::StringWriter<>(results, row);
auto proxy = exec::StringWriter<>(result, row);
stringImpl::reverse<isAscii>(proxy, input->valueAt(row).getString());
proxy.finalize();
});
Expand All @@ -53,38 +58,50 @@ class ReverseFunction : public exec::VectorFunction {
VectorPtr& result) const override {
VELOX_CHECK_EQ(args.size(), 1);

auto& arg = args[0];

switch (args[0]->typeKind()) {
case TypeKind::ARRAY:
applyArray(rows, args, context, result);
applyArray(rows, arg, context, result);
return;
case TypeKind::VARCHAR:
applyVarchar(rows, args, context, result);
case TypeKind::VARCHAR: {
const auto ascii = isAscii(arg.get(), rows);
applyVarchar(rows, arg, ascii, context, result);
return;
}
case TypeKind::VARBINARY:
// The only difference between VARCHAR and VARBINARY input is that
// VARBINARY is reversed byte-by-byte, while VARCHAR is reversed
// character-by-character. Hence, VARBINARY behavior is the same as
// VARCHAR with ascii flag set to true.
applyVarchar(rows, arg, true /*isAscii*/, context, result);
return;
default:
VELOX_FAIL(
"Unsupported input type for 'reverse' function: {}",
args[0]->type()->toString());
arg->type()->toString());
}
}

void applyVarchar(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
VectorPtr& arg,
bool isAscii,
exec::EvalCtx& context,
VectorPtr& result) const {
auto* arg = args[0].get();

auto ascii = isAscii(arg, rows);
// Capture the pointer to input argument. prepareFlatResultsVector may move
// it into result.
auto* originalArg = arg.get();

prepareFlatResultsVector(result, rows, context, args[0]);
prepareFlatResultsVector(result, rows, context, arg, arg->type());
auto* flatResult = result->as<FlatVector<StringView>>();

// Input can be constant or flat.
if (arg->isConstantEncoding()) {
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
if (originalArg->isConstantEncoding()) {
auto value = originalArg->as<ConstantVector<StringView>>()->valueAt(0);

auto proxy = exec::StringWriter<>(flatResult, rows.begin());
if (ascii) {
if (isAscii) {
stringImpl::reverse<true>(proxy, value.str());
} else {
stringImpl::reverse<false>(proxy, value.str());
Expand All @@ -96,10 +113,10 @@ class ReverseFunction : public exec::VectorFunction {

rows.applyToSelected([&](auto row) { rawResults[row] = reversedValue; });
} else {
auto flatInput = arg->as<FlatVector<StringView>>();
auto flatInput = originalArg->as<FlatVector<StringView>>();

StringEncodingTemplateWrapper<ApplyVarcharInternal>::apply(
ascii, rows, flatInput, flatResult);
isAscii, rows, flatInput, flatResult);
}
}

Expand All @@ -113,11 +130,9 @@ class ReverseFunction : public exec::VectorFunction {

void applyArray(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
VectorPtr& arg,
exec::EvalCtx& context,
VectorPtr& result) const {
auto& arg = args[0];

VectorPtr localResult;

// Input can be constant or flat.
Expand Down Expand Up @@ -189,6 +204,11 @@ class ReverseFunction : public exec::VectorFunction {
.returnType("varchar")
.argumentType("varchar")
.build(),
// varbinary -> varbinary
exec::FunctionSignatureBuilder()
.returnType("varbinary")
.argumentType("varbinary")
.build(),
};
}
};
Expand Down
21 changes: 21 additions & 0 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1415,6 +1415,27 @@ TEST_F(StringFunctionsTest, reverse) {
EXPECT_EQ(reverse(invalidIncompleteString), "\xa0\xed");
}

// Checks that reverse() flips VARBINARY values one byte at a time, and
// contrasts that with VARCHAR values, which are flipped one character at a
// time.
TEST_F(StringFunctionsTest, varbinaryReverse) {
  // UTF-8 encodings of the multi-byte characters used below:
  //   \u4FE1 -> \xE4\xBF\xA1 (3 bytes)
  //   \u7231 -> \xE7\x88\xB1 (3 bytes)

  // VARBINARY: bytes are reversed individually, so the multi-byte characters
  // are not preserved in the output.
  {
    auto binaryInput =
        makeFlatVector<std::string>({"hi", "", "\u4FE1 \u7231"}, VARBINARY());
    auto binaryExpected = makeFlatVector<std::string>(
        {"ih", "", "\xB1\x88\xE7 \xA1\xBF\xE4"}, VARBINARY());

    test::assertEqualVectors(
        binaryExpected, evaluate("reverse(c0)", makeRowVector({binaryInput})));
  }

  // VARCHAR: the very same bytes are reversed character-by-character, so each
  // multi-byte character stays intact.
  {
    auto stringInput =
        makeFlatVector<std::string>({"hi", "", "\u4FE1 \u7231"}, VARCHAR());
    auto stringExpected = makeFlatVector<std::string>(
        {"ih", "", "\xE7\x88\xB1 \xE4\xBF\xA1"}, VARCHAR());

    test::assertEqualVectors(
        stringExpected, evaluate("reverse(c0)", makeRowVector({stringInput})));
  }
}

TEST_F(StringFunctionsTest, toUtf8) {
const auto toUtf8 = [&](std::optional<std::string> value) {
return evaluateOnce<std::string>("to_utf8(c0)", value);
Expand Down

0 comments on commit 2e081d1

Please sign in to comment.