From 89d644666c0e99c003703a91ee1a1137eb65436c Mon Sep 17 00:00:00 2001 From: Bryce Arden Date: Wed, 1 Jun 2022 14:24:47 -0500 Subject: [PATCH 1/4] feat: add ibis-substrait join operator example This patch adds a walkthrough example of converting a simple `join` operator from [Ibis][0] through the [Substrait relalg compute IR][1]. We use the [ibis-substrait][2] compiler to bootstrap a substrait `Extension` that supports some primitive operators (including join) so that we can study the IR. Having an example to study / walkthrough should help us better understand the translation of `substrait` IR into the MLIR ecosystem. Example protobuf text for the substrait join can be found here: https://gist.github.com/bsarden-rivos/280109a64f1f3a2c497d5d2de2f53cd6 [0]: https://github.com/ibis-project/ibis [1]: https://github.com/substrait-io/substrait [2]: https://github.com/ibis-project/ibis-substrait --- python/test/ibis/conftest.py | 10 +++ python/test/ibis/test_ibis_substrait.py | 112 ++++++++++++++++++++++++ requirements.txt | 2 + 3 files changed, 124 insertions(+) create mode 100644 python/test/ibis/conftest.py create mode 100644 python/test/ibis/test_ibis_substrait.py diff --git a/python/test/ibis/conftest.py b/python/test/ibis/conftest.py new file mode 100644 index 0000000..24bb618 --- /dev/null +++ b/python/test/ibis/conftest.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +import pytest + +from ibis_substrait.compiler.core import SubstraitCompiler + + +@pytest.fixture +def compiler(): + return SubstraitCompiler() diff --git a/python/test/ibis/test_ibis_substrait.py b/python/test/ibis/test_ibis_substrait.py new file mode 100644 index 0000000..7ae5859 --- /dev/null +++ b/python/test/ibis/test_ibis_substrait.py @@ -0,0 +1,112 @@ +from collections import OrderedDict + +import ibis +import pytest +from google.protobuf import json_format + +from ibis_substrait.compiler.translate import translate +from ibis_substrait.proto.substrait import type_pb2 as stt +from ibis_substrait.proto.substrait.algebra_pb2 import Expression, Rel + +NULLABILITY_NULLABLE = stt.Type.Nullability.NULLABILITY_NULLABLE +NULLABILITY_REQUIRED = stt.Type.Nullability.NULLABILITY_REQUIRED + +# Test adopted from upstream ibis-substrait unit(s) +# https://github.com/ibis-project/ibis-substrait/blob/main/ibis_substrait/tests/compiler/test_compiler.py + +@pytest.fixture +def t0(): + return ibis.table( + [ + ("full_name", "string"), + ("age", "int64"), + ("ts", "timestamp('UTC')"), + ("delta", "interval"), + ] + ) + +@pytest.fixture +def t1(): + return ibis.table( + [ + ("full_name", "string"), + ("age", "int64"), + ("ts", "timestamp('UTC')"), + ("delta", "interval"), + ] + ) + + +def to_dict(message): + """Print Protobuf message as python dictionary object.""" + return json_format.MessageToDict(message) + + +def test_join(t0, t1, compiler): + """A walkthrough of a join expression in Substrait.""" + expr = ( + t0.left_join(t1, t0.age == t1.age) + ) + result = translate(expr, compiler) + + # This plan is a "volcano" style plan meant for bottoms-up execution. + # As a result, we top-level operation in the relation is the final projection + # https://github.com/substrait-io/substrait/blob/main/proto/substrait/algebra.proto + # + # TODO(bsarden): Find out which logical plan optimizers are used. + assert(result.WhichOneof("rel_type") == "project") + input_: Rel = result.project.input + assert(input_.WhichOneof("rel_type") == "join") + + join: Rel = input_.join + + # The `Expression` message type describes functions / arguments to run on + # the given operator. Each `Expression` defines a Relational Expression Type + # (`rex_type``), which maps to a broad categorization of the underlying + # function category (e.g., `ScalarFunction`, `WindowFunction`, `IfThen`, + # etc.). + join_expr: Expression = join.expression + + # A Join expression maps to a + assert(join_expr.WhichOneof("rex_type") == "scalar_function") + scalar_func = join_expr.scalar_function + + # Each `rex_type` function breaks down into their own `protobuf.Message` type, + # but we will study the `ScalarFunction` as an example, since they are all pretty + # similar. Each `ScalarFunction` maps to: + # 1. A `function_reference`: which represents a "pointer" to a uniquely identifiable + # operator ID that has been [registered][0] with the corresponding `Plan` type. + # These functions are serialized / registered with the `Plan` object through the + # definition of an `Extension` (see link above) and are often referred to as `*_anchor` + # in the specification. + # 2. A list of `FunctionArguments`: these include input type specifications, and can + # also be the result of another `Expression`. + # 3. A `Type` definition for the output: Currently there is only one `output_type` per + # operator that is supported. + # + # [0]: https://github.com/ibis-project/ibis-substrait/blob/main/ibis_substrait/compiler/core.py#L53-L80 + assert len(scalar_func.args) == 2, "a join should always have two operands" + + # We verify that the inputs are `FieldReference` expressions, because we are grabbing + # the `age` column on both tables. To check this, we make sure that both selection ordinal values + # match, since the two tables are equivalent. + arg0, arg1 = scalar_func.args + assert arg0.WhichOneof("rex_type") == "selection" + assert arg1.WhichOneof("rex_type") == "selection" + sel0, sel1 = arg0.selection, arg1.selection + sel0_col_id = sel0.direct_reference.struct_field.field + sel1_col_id = sel1.direct_reference.struct_field.field + assert sel0_col_id == sel1_col_id, "Ordinal values should match!" + + # The left / right sides of the join equate to Table Scan operations + # + # Which contains a struct about field names, dtypes, and whether a field + # is nullable. + left, right = input_.join.left, input_.join.right + assert left.WhichOneof("rel_type") == "read" + assert right.WhichOneof("rel_type") == "read" + + # with open("test_join.pb", "wb") as f: + # f.write(result.SerializeToString()) + js = to_dict(result) + assert js \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0c87ab7..f5f61a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,5 @@ python-dateutil==2.8.2 pytz==2021.3 six==1.16.0 tomli==2.0.1 +ibis-framework==3.0.2 +ibis-substrait==2.7.0 From 7a6bc57b36864f55c27dbd6b217654f0f6bbdfaa Mon Sep 17 00:00:00 2001 From: Bryce Arden Date: Wed, 1 Jun 2022 14:30:27 -0500 Subject: [PATCH 2/4] chore: add venv to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 67fe8eb..278ccbd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ build/ .cache/ __pycache__/ +venv +*.venv From 578b8109a279d285a5088a51a23c759ae0549cc1 Mon Sep 17 00:00:00 2001 From: Bryce Arden Date: Wed, 1 Jun 2022 14:51:50 -0500 Subject: [PATCH 3/4] test(substrait): fixup bad assert We can't safely assert that internal ordinal values of table indices will match, since that is an internal representation. However, we can assert that both column fields *exist*. --- python/test/ibis/test_ibis_substrait.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/test/ibis/test_ibis_substrait.py b/python/test/ibis/test_ibis_substrait.py index 7ae5859..c127850 100644 --- a/python/test/ibis/test_ibis_substrait.py +++ b/python/test/ibis/test_ibis_substrait.py @@ -96,7 +96,8 @@ def test_join(t0, t1, compiler): sel0, sel1 = arg0.selection, arg1.selection sel0_col_id = sel0.direct_reference.struct_field.field sel1_col_id = sel1.direct_reference.struct_field.field - assert sel0_col_id == sel1_col_id, "Ordinal values should match!" + assert sel0_col_id + assert sel1_col_id # The left / right sides of the join equate to Table Scan operations # From bd6eaa153ec9ac2650865ef07be452a72c27641a Mon Sep 17 00:00:00 2001 From: Bryce Arden Date: Wed, 1 Jun 2022 14:54:43 -0500 Subject: [PATCH 4/4] doc: fixup stale comment --- python/test/ibis/test_ibis_substrait.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/test/ibis/test_ibis_substrait.py b/python/test/ibis/test_ibis_substrait.py index c127850..70c540b 100644 --- a/python/test/ibis/test_ibis_substrait.py +++ b/python/test/ibis/test_ibis_substrait.py @@ -66,8 +66,6 @@ def test_join(t0, t1, compiler): # function category (e.g., `ScalarFunction`, `WindowFunction`, `IfThen`, # etc.). join_expr: Expression = join.expression - - # A Join expression maps to a assert(join_expr.WhichOneof("rex_type") == "scalar_function") scalar_func = join_expr.scalar_function