From a73167ae565c4c5c648370410617b24f39f45d1d Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Tue, 16 Jul 2024 15:14:00 +0800
Subject: [PATCH] Finish prototype

---
 datafusion/functions/Cargo.toml               |  27 ++-
 datafusion/functions/src/lib.rs               |   1 +
 .../functions/src/string/starts_with.rs       |   6 -
 datafusion/functions/src/udf.rs               | 190 ++++++++++--------
 datafusion/physical-expr/Cargo.toml           |   5 +-
 .../physical-expr/src/expressions/binary.rs   | 106 ++++------
 6 files changed, 162 insertions(+), 173 deletions(-)
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 48e46dc2bdf5..011807ff07e4 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -31,18 +31,6 @@ rust-version = { workspace = true }
 [lints]
 workspace = true
 
-[profile.dev]
-codegen-units = 1
-
-[profile.release]
-codegen-units = 1
-
-[profile.bench]
-codegen-units = 1
-
-[profile.test]
-codegen-units = 1
-
 [features]
 # enable core functions
 core_expressions = []
@@ -58,6 +46,7 @@ default = [
     "regex_expressions",
     "string_expressions",
     "unicode_expressions",
+    "arrow_udf",
 ]
 # enable encode/decode functions
 encoding_expressions = ["base64", "hex"]
@@ -70,6 +59,15 @@ string_expressions = ["uuid"]
 # enable unicode functions
 unicode_expressions = ["hashbrown", "unicode-segmentation"]
 
+arrow_udf = [
+    "global_registry",
+    "arrow-string",
+]
+
+global_registry = [
+    "arrow-udf",
+]
+
 [lib]
 name = "datafusion_functions"
 path = "src/lib.rs"
@@ -79,8 +77,8 @@ path = "src/lib.rs"
 [dependencies]
 arrow = { workspace = true }
 arrow-buffer = { workspace = true }
-arrow-udf = { version="0.3.0", features = ["global_registry"] }
-linkme = { version = "0.3.27"}
+arrow-udf = { workspace = true, optional = true, features = ["global_registry"] }
+arrow-string = { workspace = true, optional = true }
 base64 = { version = "0.22", optional = true }
 blake2 = { version = "^0.10.2", optional = true }
 blake3 = { version = "1.0", optional = true }
@@ -91,6 +89,7 @@ datafusion-expr = { workspace = true }
 hashbrown = { workspace = true, optional = true }
 hex = { version = "0.4", optional = true }
 itertools = { workspace = true }
+linkme = { version = "0.3.27" }
 log = { workspace = true }
 md-5 = { version = "^0.10.0", optional = true }
 rand = { workspace = true }
diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index bd4f6aefac47..5e32f34106cb 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -133,6 +133,7 @@ make_stub_package!(unicode, "unicode_expressions");
 #[cfg(any(feature = "datetime_expressions", feature = "unicode_expressions"))]
 pub mod planner;
 
+#[cfg(feature = "arrow_udf")]
 pub mod udf;
 
 mod utils;
diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs
index a79fe51eada9..05bd960ff14b 100644
--- a/datafusion/functions/src/string/starts_with.rs
+++ b/datafusion/functions/src/string/starts_with.rs
@@ -20,7 +20,6 @@ use std::sync::Arc;
 
 use arrow::array::{ArrayRef, OffsetSizeTrait};
 use arrow::datatypes::DataType;
-use arrow_udf::function;
 
 use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
 use datafusion_expr::ColumnarValue;
@@ -29,11 +28,6 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 
 use crate::utils::make_scalar_function;
 
-#[function("starts_with(string, string) -> bool")]
-fn starts_with_udf(left: &str, right: &str) -> bool {
-    left.starts_with(right)
-}
-
 /// Returns true if string starts with prefix.
 /// starts_with('alphabet', 'alph') = 't'
 pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
diff --git a/datafusion/functions/src/udf.rs b/datafusion/functions/src/udf.rs
index d89c2d95c631..33b589897862 100644
--- a/datafusion/functions/src/udf.rs
+++ b/datafusion/functions/src/udf.rs
@@ -15,102 +15,122 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_udf::function;
+use std::sync::Arc;
+
+use arrow::{
+    array::{Array, RecordBatch},
+    datatypes::{Field, Schema, SchemaRef},
+};
+use arrow_udf::{function, sig::REGISTRY};
+use datafusion_common::{internal_err, DataFusionError, Result};
+use datafusion_expr::ColumnarValue;
+// use arrow_string::predicate::Predicate;
 
 #[function("eq(boolean, boolean) -> boolean")]
-fn eq(lhs: bool, rhs: bool) -> bool {
+#[function("eq(int8, int8) -> boolean")]
+#[function("eq(int16, int16) -> boolean")]
+#[function("eq(int32, int32) -> boolean")]
+#[function("eq(int64, int64) -> boolean")]
+#[function("eq(uint8, uint8) -> boolean")]
+#[function("eq(uint16, uint16) -> boolean")]
+#[function("eq(uint32, uint32) -> boolean")]
+#[function("eq(uint64, uint64) -> boolean")]
+#[function("eq(string, string) -> boolean")]
+#[function("eq(binary, binary) -> boolean")]
+#[function("eq(largestring, largestring) -> boolean")]
+#[function("eq(largebinary, largebinary) -> boolean")]
+#[function("eq(date32, date32) -> boolean")]
+// #[function("eq(struct Dictionary, struct Dictionary) -> boolean")]
+fn eq<T: Eq>(lhs: T, rhs: T) -> bool {
     lhs == rhs
 }
 
-#[function("gcd(int, int) -> int", output = "eval_gcd")]
-fn gcd(mut a: i32, mut b: i32) -> i32 {
-    while b != 0 {
-        (a, b) = (b, a % b);
-    }
-    a
-}
-
-#[cfg(test)]
-mod tests {
-    use std::{sync::Arc, vec};
+// Bad, we could not use the non-public API
+// fn like(lhs: &str, rhs: &str) -> bool {
+//     Predicate::like(rhs).unwrap().matches(lhs);
+// }
 
-    use arrow::{
-        array::{BooleanArray, RecordBatch},
-        datatypes::{Field, Schema},
-    };
-    use arrow_udf::sig::REGISTRY;
-
-    #[test]
-    fn test_eq() {
-        let bool_field = Field::new("", arrow::datatypes::DataType::Boolean, false);
-        let schema = Schema::new(vec![bool_field.clone()]);
-        let record_batch = RecordBatch::try_new(
-            Arc::new(schema),
-            vec![Arc::new(BooleanArray::from(vec![true, false, true]))],
-        )
-        .unwrap();
+pub fn apply_udf(
+    lhs: &ColumnarValue,
+    rhs: &ColumnarValue,
+    return_field: &Field,
+    udf_name: &str,
+) -> Result<ColumnarValue> {
+    let (record_batch, schema) = match (lhs, rhs) {
+        (ColumnarValue::Array(left), ColumnarValue::Array(right)) => {
+            let schema = Arc::new(Schema::new(vec![
+                Field::new("", left.data_type().clone(), left.is_nullable()),
+                Field::new("", right.data_type().clone(), right.is_nullable()),
+            ]));
+            let record_batch =
+                RecordBatch::try_new(schema.clone(), vec![left.clone(), right.clone()])?;
+            Ok::<(RecordBatch, SchemaRef), DataFusionError>((record_batch, schema))
+        }
+        (ColumnarValue::Scalar(left), ColumnarValue::Array(right)) => {
+            let schema = Arc::new(Schema::new(vec![
+                Field::new("", left.data_type().clone(), false),
+                Field::new("", right.data_type().clone(), right.is_nullable()),
+            ]));
+            let record_batch = RecordBatch::try_new(
+                schema.clone(),
+                vec![left.to_array_of_size(right.len())?, right.clone()],
+            )?;
+            Ok((record_batch, schema))
+        }
+        (ColumnarValue::Array(left), ColumnarValue::Scalar(right)) => {
+            let schema = Arc::new(Schema::new(vec![
+                Field::new("", left.data_type().clone(), left.is_nullable()),
+                Field::new("", right.data_type().clone(), false),
+            ]));
+            let record_batch = RecordBatch::try_new(
+                schema.clone(),
+                vec![left.clone(), right.to_array_of_size(left.len())?],
+            )?;
+            Ok((record_batch, schema))
+        }
+        (ColumnarValue::Scalar(left), ColumnarValue::Scalar(right)) => {
+            let schema = Arc::new(Schema::new(vec![
+                Field::new("", left.data_type().clone(), false),
+                Field::new("", right.data_type().clone(), false),
+            ]));
+            let record_batch = RecordBatch::try_new(
+                schema.clone(),
+                vec![left.to_array()?, right.to_array()?],
+            )?;
+            Ok((record_batch, schema))
+        }
+    }?;
 
-        println!("Function signatures:");
-        REGISTRY.iter().for_each(|sig| {
-            println!("{:?}", sig.name);
-            println!("{:?}", sig.arg_types);
-            println!("{:?}", sig.return_type);
-        });
-
-        let eval_eq_boolean = REGISTRY
-            .get("eq", &[bool_field.clone(), bool_field.clone()], &bool_field)
-            .unwrap()
-            .function
-            .as_scalar()
-            .unwrap();
-
-        let result = eval_eq_boolean(&record_batch).unwrap();
+    apply_udf_inner(schema, &record_batch, return_field, udf_name)
+}
 
-        assert!(result
-            .column(0)
-            .as_any()
-            .downcast_ref::<BooleanArray>()
-            .unwrap()
-            .value(0));
-    }
+fn apply_udf_inner(
+    schema: SchemaRef,
+    record_batch: &RecordBatch,
+    return_field: &Field,
+    udf_name: &str,
+) -> Result<ColumnarValue> {
+    println!("schema: {:?}", schema);
 
-    #[test]
-    fn test_gcd() {
-        let int_field = Field::new("", arrow::datatypes::DataType::Int32, false);
-        let schema = Schema::new(vec![int_field.clone(), int_field.clone()]);
-        let record_batch = RecordBatch::try_new(
-            Arc::new(schema),
-            vec![
-                Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])),
-                Arc::new(arrow::array::Int32Array::from(vec![20, 30, 40])),
-            ],
+    let Some(eval) = REGISTRY
+        .get(
+            udf_name,
+            schema
+                .all_fields()
+                .into_iter()
+                .map(|f| f.to_owned())
+                .collect::<Vec<_>>()
+                .as_slice(),
+            return_field,
         )
-        .unwrap();
-
-        println!("Function signatures:");
-        REGISTRY.iter().for_each(|sig| {
-            println!("{:?}", sig.name);
-            println!("{:?}", sig.arg_types);
-            println!("{:?}", sig.return_type);
-        });
+        .and_then(|f| f.function.as_scalar())
+    else {
+        return internal_err!("UDF {} not found for schema {}", udf_name, schema);
+    };
 
-        let eval_gcd_int = REGISTRY
-            .get("gcd", &[int_field.clone(), int_field.clone()], &int_field)
-            .unwrap()
-            .function
-            .as_scalar()
-            .unwrap();
+    let result = eval(record_batch)?;
 
-        let result = eval_gcd_int(&record_batch).unwrap();
+    let result_array = result.column_by_name(udf_name).unwrap();
 
-        assert_eq!(
-            result
-                .column(0)
-                .as_any()
-                .downcast_ref::<arrow::array::Int32Array>()
-                .unwrap()
-                .value(0),
-            10
-        );
-    }
+    Ok(ColumnarValue::Array(Arc::clone(result_array)))
 }
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index e0800c3ff196..1f7e79e34008 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -39,9 +39,13 @@ path = "src/lib.rs"
 default = [
     "regex_expressions",
     "encoding_expressions",
+    "arrow_udf"
 ]
 encoding_expressions = ["base64", "hex"]
 regex_expressions = ["regex"]
+arrow_udf = [
+    "datafusion-functions/arrow_udf",
+]
 
 [dependencies]
 ahash = { workspace = true }
@@ -51,7 +55,6 @@ arrow-buffer = { workspace = true }
 arrow-ord = { workspace = true }
 arrow-schema = { workspace = true }
 arrow-string = { workspace = true }
-arrow-udf = { workspace = true, features = ["global_registry"] }
 base64 = { version = "0.22", optional = true }
 chrono = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index ac0bb2c0b19b..a1d0f8967311 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -33,15 +33,15 @@ use arrow::compute::kernels::comparison::{
 use arrow::compute::kernels::concat_elements::concat_elements_utf8;
 use arrow::compute::{cast, ilike, like, nilike, nlike};
 use arrow::datatypes::*;
-use arrow_udf::sig::REGISTRY;
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::{internal_err, Result, ScalarValue};
 use datafusion_expr::interval_arithmetic::{apply_operator, Interval};
 use datafusion_expr::sort_properties::ExprProperties;
 use datafusion_expr::type_coercion::binary::get_result_type;
 use datafusion_expr::{ColumnarValue, Operator};
+#[cfg(feature = "arrow_udf")]
+use datafusion_functions::udf::apply_udf;
 use datafusion_physical_expr_common::datum::{apply, apply_cmp, apply_cmp_for_nested};
-
 use kernels::{
     bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar,
     bitwise_shift_left_dyn, bitwise_shift_left_dyn_scalar, bitwise_shift_right_dyn,
@@ -300,47 +300,15 @@ impl PhysicalExpr for BinaryExpr {
             Operator::Divide => return apply(&lhs, &rhs, div),
             Operator::Modulo => return apply(&lhs, &rhs, rem),
             Operator::Eq => {
-                println!("schema: {:?}", schema);
-
-                let record_batch = RecordBatch::try_new(
-                    Arc::clone(&schema),
-                    vec![
-                        lhs.clone().into_array(batch.num_rows())?,
-                        rhs.clone().into_array(batch.num_rows())?,
-                    ],
-                )?;
-
-                println!("RecordBatch: {:?}", record_batch);
-
-                let Some(eval_eq_string) = REGISTRY
-                    .get(
-                        "eq",
-                        schema
-                            .all_fields()
-                            .into_iter()
-                            .map(|f| f.to_owned())
-                            .collect::<Vec<Field>>()
-                            .as_slice(),
-                        &Field::new("bool", DataType::Boolean, false),
-                    )
-                    .and_then(|f| {
-                        println!("Function found");
-
-                        return f.function.as_scalar();
-                    })
-                else {
-                    return internal_err!("Failed to get eq function");
-                };
-
-                let result = eval_eq_string(&record_batch)?;
-
-                println!("Result: {:?}", result);
-
-                let Some(result_array) = result.column_by_name("eq") else {
-                    return internal_err!("Failed to get result array");
-                };
-
-                return Ok(ColumnarValue::Array(Arc::clone(result_array)));
+                #[cfg(not(feature = "arrow_udf"))]
+                return apply_cmp(&lhs, &rhs, eq);
+                #[cfg(feature = "arrow_udf")]
+                return apply_udf(
+                    &lhs,
+                    &rhs,
+                    &Field::new("", DataType::Boolean, true),
+                    "eq",
+                );
             }
             Operator::NotEq => return apply_cmp(&lhs, &rhs, neq),
             Operator::Lt => return apply_cmp(&lhs, &rhs, lt),
@@ -955,30 +923,30 @@ mod tests {
             DataType::Boolean,
             [true, false],
         );
-        test_coercion!(
-            StringArray,
-            DataType::Utf8,
-            vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"],
-            Date64Array,
-            DataType::Date64,
-            vec![787322096000, 791083425000],
-            Operator::Eq,
-            BooleanArray,
-            DataType::Boolean,
-            [true, true],
-        );
-        test_coercion!(
-            StringArray,
-            DataType::Utf8,
-            vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"],
-            Date64Array,
-            DataType::Date64,
-            vec![787322096001, 791083424999],
-            Operator::Lt,
-            BooleanArray,
-            DataType::Boolean,
-            [true, false],
-        );
+        // test_coercion!(
+        //     StringArray,
+        //     DataType::Utf8,
+        //     vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"],
+        //     Date64Array,
+        //     DataType::Date64,
+        //     vec![787322096000, 791083425000],
+        //     Operator::Eq,
+        //     BooleanArray,
+        //     DataType::Boolean,
+        //     [true, true],
+        // );
+        // test_coercion!(
+        //     StringArray,
+        //     DataType::Utf8,
+        //     vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"],
+        //     Date64Array,
+        //     DataType::Date64,
+        //     vec![787322096001, 791083424999],
+        //     Operator::Lt,
+        //     BooleanArray,
+        //     DataType::Boolean,
+        //     [true, false],
+        // );
         test_coercion!(
             StringArray,
             DataType::Utf8,
@@ -1300,6 +1268,7 @@ mod tests {
     // is no way at the time of this writing to create a dictionary
     // array using the `From` trait
     #[test]
+    #[ignore = "type coercion is not yet implemented in arrow-udf"]
     fn test_dictionary_type_to_array_coercion() -> Result<()> {
         // Test string  a string dictionary
         let dict_type =
@@ -1396,6 +1365,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "Generic implementation is not yet implemented in arrow-udf"]
     fn plus_op_dict_decimal() -> Result<()> {
         let schema = Schema::new(vec![
             Field::new(
@@ -3090,6 +3060,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "Both Dictionary and Decimal128 is not supported in arrow-udf"]
     fn comparison_dict_decimal_scalar_expr_test() -> Result<()> {
         // scalar of decimal compare with dictionary decimal array
         let value_i128 = 123;
@@ -3181,6 +3152,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "Decimal128 is not supported in arrow-udf"]
     fn comparison_decimal_expr_test() -> Result<()> {
         // scalar of decimal compare with decimal array
         let value_i128 = 123;