From 2b4e2003e15d372b7e6aa414e768c44cab5b8e2e Mon Sep 17 00:00:00 2001 From: Ian Lai <108986288+Chen-Yuan-Lai@users.noreply.github.com> Date: Mon, 30 Dec 2024 03:25:33 +0800 Subject: [PATCH 1/4] doc-gen: migrate scalar functions (array) documentation 3/3 (#13930) * doc-gen: migrate scalar functions (array) documentation 3/3 * fix: import doc and macro, fix typo and update function docs --------- Co-authored-by: Cheng-Yuan-Lai --- datafusion/functions-nested/src/range.rs | 144 +++++++-------- datafusion/functions-nested/src/remove.rs | 167 +++++++---------- datafusion/functions-nested/src/repeat.rs | 71 ++++---- datafusion/functions-nested/src/replace.rs | 169 +++++++----------- datafusion/functions-nested/src/resize.rs | 63 +++---- datafusion/functions-nested/src/reverse.rs | 50 +++--- datafusion/functions-nested/src/set_ops.rs | 162 +++++++---------- datafusion/functions-nested/src/sort.rs | 66 +++---- datafusion/functions-nested/src/string.rs | 133 ++++++-------- .../source/user-guide/sql/scalar_functions.md | 123 +++++++------ 10 files changed, 486 insertions(+), 662 deletions(-) diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index 8344c1a261db..cf27c70c2ba8 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -37,16 +37,16 @@ use datafusion_common::cast::{ use datafusion_common::{ exec_datafusion_err, exec_err, internal_err, not_impl_datafusion_err, Result, }; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use itertools::Itertools; use std::any::Any; use std::cmp::Ordering; use std::iter::from_fn; use std::str::FromStr; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( Range, @@ -55,6 +55,39 @@ make_udf_expr_and_func!( "create a list of values in the range between start and stop", range_udf ); 
+ +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0.", + syntax_example = "range(start, stop, step)", + sql_example = r#"```sql +> select range(2, 10, 3); ++-----------------------------------+ +| range(Int64(2),Int64(10),Int64(3))| ++-----------------------------------+ +| [2, 5, 8] | ++-----------------------------------+ + +> select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH); ++--------------------------------------------------------------+ +| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | ++--------------------------------------------------------------+ +| [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] | ++--------------------------------------------------------------+ +```"#, + argument( + name = "start", + description = "Start of the range. Ints, timestamps, dates or string types that can be coerced to Date32 are supported." + ), + argument( + name = "end", + description = "End of the range (not included). Type must be the same as start." + ), + argument( + name = "step", + description = "Increase by step (cannot be 0). Steps less than a day are supported only for timestamp ranges." + ) +)] #[derive(Debug)] pub(super) struct Range { signature: Signature, @@ -141,52 +174,10 @@ impl ScalarUDFImpl for Range { } fn documentation(&self) -> Option<&Documentation> { - Some(get_range_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_range_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. 
Step cannot be 0.", - - "range(start, stop, step)") - .with_sql_example( - r#"```sql -> select range(2, 10, 3); -+-----------------------------------+ -| range(Int64(2),Int64(10),Int64(3))| -+-----------------------------------+ -| [2, 5, 8] | -+-----------------------------------+ - -> select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH); -+--------------------------------------------------------------+ -| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | -+--------------------------------------------------------------+ -| [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] | -+--------------------------------------------------------------+ -```"#, - ) - .with_argument( - "start", - "Start of the range. Ints, timestamps, dates or string types that can be coerced to Date32 are supported.", - ) - .with_argument( - "end", - "End of the range (not included). Type must be the same as start.", - ) - .with_argument( - "step", - "Increase by step (cannot be 0). Steps less than a day are supported only for timestamp ranges.", - ) - .build() - }) -} - make_udf_expr_and_func!( GenSeries, gen_series, @@ -194,6 +185,32 @@ make_udf_expr_and_func!( "create a list of values in the range between start and stop, include upper bound", gen_series_udf ); + +#[user_doc( + doc_section(label = "Array Functions"), + description = "Similar to the range function, but it includes the upper bound.", + syntax_example = "generate_series(start, stop, step)", + sql_example = r#"```sql +> select generate_series(1,3); ++------------------------------------+ +| generate_series(Int64(1),Int64(3)) | ++------------------------------------+ +| [1, 2, 3] | ++------------------------------------+ +```"#, + argument( + name = "start", + description = "Start of the series. Ints, timestamps, dates or string types that can be coerced to Date32 are supported." + ), + argument( + name = "end", + description = "End of the series (included). 
Type must be the same as start." + ), + argument( + name = "step", + description = "Increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges." + ) +)] #[derive(Debug)] pub(super) struct GenSeries { signature: Signature, @@ -283,45 +300,10 @@ impl ScalarUDFImpl for GenSeries { } fn documentation(&self) -> Option<&Documentation> { - Some(get_generate_series_doc()) + self.doc() } } -static GENERATE_SERIES_DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_generate_series_doc() -> &'static Documentation { - GENERATE_SERIES_DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Similar to the range function, but it includes the upper bound.", - - "generate_series(start, stop, step)") - .with_sql_example( - r#"```sql -> select generate_series(1,3); -+------------------------------------+ -| generate_series(Int64(1),Int64(3)) | -+------------------------------------+ -| [1, 2, 3] | -+------------------------------------+ -```"#, - ) - .with_argument( - "start", - "start of the series. Ints, timestamps, dates or string types that can be coerced to Date32 are supported.", - ) - .with_argument( - "end", - "end of the series (included). Type must be the same as start.", - ) - .with_argument( - "step", - "increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges.", - ) - .build() - }) -} - /// Generates an array of integers from start to stop with a given step. /// /// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values. 
diff --git a/datafusion/functions-nested/src/remove.rs b/datafusion/functions-nested/src/remove.rs index e5521706bece..b6031ce733f1 100644 --- a/datafusion/functions-nested/src/remove.rs +++ b/datafusion/functions-nested/src/remove.rs @@ -27,12 +27,12 @@ use arrow_buffer::OffsetBuffer; use arrow_schema::{DataType, Field}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( ArrayRemove, @@ -42,6 +42,27 @@ make_udf_expr_and_func!( array_remove_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Removes the first element from the array equal to the given value.", + syntax_example = "array_remove(array, element)", + sql_example = r#"```sql +> select array_remove([1, 2, 2, 3, 2, 1, 4], 2); ++----------------------------------------------+ +| array_remove(List([1,2,2,3,2,1,4]),Int64(2)) | ++----------------------------------------------+ +| [1, 2, 3, 2, 1, 4] | ++----------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "element", + description = "Element to be removed from the array." 
+ ) +)] #[derive(Debug)] pub(super) struct ArrayRemove { signature: Signature, @@ -87,41 +108,10 @@ impl ScalarUDFImpl for ArrayRemove { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_remove_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_remove_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Removes the first element from the array equal to the given value.", - - "array_remove(array, element)") - .with_sql_example( - r#"```sql -> select array_remove([1, 2, 2, 3, 2, 1, 4], 2); -+----------------------------------------------+ -| array_remove(List([1,2,2,3,2,1,4]),Int64(2)) | -+----------------------------------------------+ -| [1, 2, 3, 2, 1, 4] | -+----------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "element", - "Element to be removed from the array.", - ) - .build() - }) -} - make_udf_expr_and_func!( ArrayRemoveN, array_remove_n, @@ -130,6 +120,28 @@ make_udf_expr_and_func!( array_remove_n_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Removes the first `max` elements from the array equal to the given value.", + syntax_example = "array_remove_n(array, element, max)", + sql_example = r#"```sql +> select array_remove_n([1, 2, 2, 3, 2, 1, 4], 2, 2); ++---------------------------------------------------------+ +| array_remove_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(2)) | ++---------------------------------------------------------+ +| [1, 3, 2, 1, 4] | ++---------------------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." 
+ ), + argument( + name = "element", + description = "Element to be removed from the array." + ), + argument(name = "max", description = "Number of first occurrences to remove.") +)] #[derive(Debug)] pub(super) struct ArrayRemoveN { signature: Signature, @@ -175,43 +187,10 @@ impl ScalarUDFImpl for ArrayRemoveN { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_remove_n_doc()) + self.doc() } } -fn get_array_remove_n_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Removes the first `max` elements from the array equal to the given value.", - - "array_remove_n(array, element, max)") - .with_sql_example( - r#"```sql -> select array_remove_n([1, 2, 2, 3, 2, 1, 4], 2, 2); -+---------------------------------------------------------+ -| array_remove_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(2)) | -+---------------------------------------------------------+ -| [1, 3, 2, 1, 4] | -+---------------------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "element", - "Element to be removed from the array.", - ) - .with_argument( - "max", - "Number of first occurrences to remove.", - ) - .build() - }) -} - make_udf_expr_and_func!( ArrayRemoveAll, array_remove_all, @@ -220,6 +199,27 @@ make_udf_expr_and_func!( array_remove_all_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Removes all elements from the array equal to the given value.", + syntax_example = "array_remove_all(array, element)", + sql_example = r#"```sql +> select array_remove_all([1, 2, 2, 3, 2, 1, 4], 2); ++--------------------------------------------------+ +| array_remove_all(List([1,2,2,3,2,1,4]),Int64(2)) | ++--------------------------------------------------+ +| [1, 3, 1, 4] | ++--------------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "element", + description = "Element to be removed from the array." 
+ ) +)] #[derive(Debug)] pub(super) struct ArrayRemoveAll { signature: Signature, @@ -265,39 +265,10 @@ impl ScalarUDFImpl for ArrayRemoveAll { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_remove_all_doc()) + self.doc() } } -fn get_array_remove_all_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Removes all elements from the array equal to the given value.", - - "array_remove_all(array, element)") - .with_sql_example( - r#"```sql -> select array_remove_all([1, 2, 2, 3, 2, 1, 4], 2); -+--------------------------------------------------+ -| array_remove_all(List([1,2,2,3,2,1,4]),Int64(2)) | -+--------------------------------------------------+ -| [1, 3, 1, 4] | -+--------------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "element", - "Element to be removed from the array.", - ) - .build() - }) -} - /// Array_remove SQL function pub fn array_remove_inner(args: &[ArrayRef]) -> Result { if args.len() != 2 { diff --git a/datafusion/functions-nested/src/repeat.rs b/datafusion/functions-nested/src/repeat.rs index 2842b91a781b..498781f5b3cc 100644 --- a/datafusion/functions-nested/src/repeat.rs +++ b/datafusion/functions-nested/src/repeat.rs @@ -29,12 +29,12 @@ use arrow_schema::DataType::{LargeList, List}; use arrow_schema::{DataType, Field}; use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( ArrayRepeat, @@ -43,6 +43,34 @@ make_udf_expr_and_func!( "returns an array 
containing element `count` times.", // doc array_repeat_udf // internal function name ); + +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns an array containing element `count` times.", + syntax_example = "array_repeat(element, count)", + sql_example = r#"```sql +> select array_repeat(1, 3); ++---------------------------------+ +| array_repeat(Int64(1),Int64(3)) | ++---------------------------------+ +| [1, 1, 1] | ++---------------------------------+ +> select array_repeat([1, 2], 2); ++------------------------------------+ +| array_repeat(List([1,2]),Int64(2)) | ++------------------------------------+ +| [[1, 2], [1, 2]] | ++------------------------------------+ +```"#, + argument( + name = "element", + description = "Element expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "count", + description = "Value of how many times to repeat the element." + ) +)] #[derive(Debug)] pub(super) struct ArrayRepeat { signature: Signature, @@ -91,47 +119,10 @@ impl ScalarUDFImpl for ArrayRepeat { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_repeat_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_repeat_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns an array containing element `count` times.", - - "array_repeat(element, count)") - .with_sql_example( - r#"```sql -> select array_repeat(1, 3); -+---------------------------------+ -| array_repeat(Int64(1),Int64(3)) | -+---------------------------------+ -| [1, 1, 1] | -+---------------------------------+ -> select array_repeat([1, 2], 2); -+------------------------------------+ -| array_repeat(List([1,2]),Int64(2)) | -+------------------------------------+ -| [[1, 2], [1, 2]] | -+------------------------------------+ -```"#, - ) - .with_argument( - "element", - "Element expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "count", - "Value of how many times to repeat the element.", - ) - .build() - }) -} - /// Array_repeat SQL function pub fn array_repeat_inner(args: &[ArrayRef]) -> Result { if args.len() != 2 { diff --git a/datafusion/functions-nested/src/replace.rs b/datafusion/functions-nested/src/replace.rs index e971d97dbf2b..0902d1d03647 100644 --- a/datafusion/functions-nested/src/replace.rs +++ b/datafusion/functions-nested/src/replace.rs @@ -27,16 +27,16 @@ use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer}; use arrow_schema::Field; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use crate::utils::compare_element_to_list; use crate::utils::make_scalar_function; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; // Create static instances of ScalarUDFs for each function make_udf_expr_and_func!(ArrayReplace, @@ -58,6 +58,25 @@ make_udf_expr_and_func!(ArrayReplaceAll, array_replace_all_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Replaces the first occurrence of the specified element with another specified element.", + syntax_example = "array_replace(array, from, to)", + sql_example = r#"```sql +> select array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5); ++--------------------------------------------------------+ +| array_replace(List([1,2,2,3,2,1,4]),Int64(2),Int64(5)) | ++--------------------------------------------------------+ +| [1, 5, 2, 3, 2, 1, 4] | ++--------------------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." 
+ ), + argument(name = "from", description = "Initial element."), + argument(name = "to", description = "Final element.") +)] #[derive(Debug)] pub(super) struct ArrayReplace { signature: Signature, @@ -103,45 +122,30 @@ impl ScalarUDFImpl for ArrayReplace { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_replace_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_replace_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Replaces the first occurrence of the specified element with another specified element.", - - "array_replace(array, from, to)") - .with_sql_example( - r#"```sql -> select array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5); -+--------------------------------------------------------+ -| array_replace(List([1,2,2,3,2,1,4]),Int64(2),Int64(5)) | -+--------------------------------------------------------+ -| [1, 5, 2, 3, 2, 1, 4] | -+--------------------------------------------------------+ +#[user_doc( + doc_section(label = "Array Functions"), + description = "Replaces the first `max` occurrences of the specified element with another specified element.", + syntax_example = "array_replace_n(array, from, to, max)", + sql_example = r#"```sql +> select array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2); ++-------------------------------------------------------------------+ +| array_replace_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(5),Int64(2)) | ++-------------------------------------------------------------------+ +| [1, 5, 5, 3, 2, 1, 4] | ++-------------------------------------------------------------------+ ```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "from", - "Initial element.", - ) - .with_argument( - "to", - "Final element.", - ) - .build() - }) -} - + argument( + name = "array", + description = "Array expression. 
Can be a constant, column, or function, and any combination of array operators." + ), + argument(name = "from", description = "Initial element."), + argument(name = "to", description = "Final element."), + argument(name = "max", description = "Number of first occurrences to replace.") +)] #[derive(Debug)] pub(super) struct ArrayReplaceN { signature: Signature, @@ -187,47 +191,29 @@ impl ScalarUDFImpl for ArrayReplaceN { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_replace_n_doc()) + self.doc() } } -fn get_array_replace_n_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Replaces the first `max` occurrences of the specified element with another specified element.", - - "array_replace_n(array, from, to, max)") - .with_sql_example( - r#"```sql -> select array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2); -+-------------------------------------------------------------------+ -| array_replace_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(5),Int64(2)) | -+-------------------------------------------------------------------+ -| [1, 5, 5, 3, 2, 1, 4] | -+-------------------------------------------------------------------+ +#[user_doc( + doc_section(label = "Array Functions"), + description = "Replaces all occurrences of the specified element with another specified element.", + syntax_example = "array_replace_all(array, from, to)", + sql_example = r#"```sql +> select array_replace_all([1, 2, 2, 3, 2, 1, 4], 2, 5); ++------------------------------------------------------------+ +| array_replace_all(List([1,2,2,3,2,1,4]),Int64(2),Int64(5)) | ++------------------------------------------------------------+ +| [1, 5, 5, 3, 5, 1, 4] | ++------------------------------------------------------------+ ```"#, - ) - .with_argument( - "array", - "Array expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "from", - "Initial element.", - ) - .with_argument( - "to", - "Final element.", - ) - .with_argument( - "max", - "Number of first occurrences to replace.", - ) - .build() - }) -} - + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument(name = "from", description = "Initial element."), + argument(name = "to", description = "Final element.") +)] #[derive(Debug)] pub(super) struct ArrayReplaceAll { signature: Signature, @@ -273,43 +259,10 @@ impl ScalarUDFImpl for ArrayReplaceAll { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_replace_all_doc()) + self.doc() } } -fn get_array_replace_all_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Replaces all occurrences of the specified element with another specified element.", - - "array_replace_all(array, from, to)") - .with_sql_example( - r#"```sql -> select array_replace_all([1, 2, 2, 3, 2, 1, 4], 2, 5); -+------------------------------------------------------------+ -| array_replace_all(List([1,2,2,3,2,1,4]),Int64(2),Int64(5)) | -+------------------------------------------------------------+ -| [1, 5, 5, 3, 5, 1, 4] | -+------------------------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "from", - "Initial element.", - ) - .with_argument( - "to", - "Final element.", - ) - .build() - }) -} - /// For each element of `list_array[i]`, replaces up to `arr_n[i]` occurrences /// of `from_array[i]`, `to_array[i]`. 
/// diff --git a/datafusion/functions-nested/src/resize.rs b/datafusion/functions-nested/src/resize.rs index c9487dd81843..8a4a88741c53 100644 --- a/datafusion/functions-nested/src/resize.rs +++ b/datafusion/functions-nested/src/resize.rs @@ -27,12 +27,12 @@ use arrow_schema::DataType::{FixedSizeList, LargeList, List}; use arrow_schema::{DataType, FieldRef}; use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array}; use datafusion_common::{exec_err, internal_datafusion_err, Result, ScalarValue}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( ArrayResize, @@ -42,6 +42,28 @@ make_udf_expr_and_func!( array_resize_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Resizes the list to contain size elements. Initializes new elements with value or empty if value is not set.", + syntax_example = "array_resize(array, size, value)", + sql_example = r#"```sql +> select array_resize([1, 2, 3], 5, 0); ++-------------------------------------+ +| array_resize(List([1,2,3],5,0)) | ++-------------------------------------+ +| [1, 2, 3, 0, 0] | ++-------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument(name = "size", description = "New size of given array."), + argument( + name = "value", + description = "Defines new elements' value or empty if value is not set." 
+ ) +)] #[derive(Debug)] pub(super) struct ArrayResize { signature: Signature, @@ -93,45 +115,10 @@ impl ScalarUDFImpl for ArrayResize { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_resize_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_resize_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Resizes the list to contain size elements. Initializes new elements with value or empty if value is not set.", - - "array_resize(array, size, value)") - .with_sql_example( - r#"```sql -> select array_resize([1, 2, 3], 5, 0); -+-------------------------------------+ -| array_resize(List([1,2,3],5,0)) | -+-------------------------------------+ -| [1, 2, 3, 0, 0] | -+-------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "size", - "New size of given array.", - ) - .with_argument( - "value", - "Defines new elements' value or empty if value is not set.", - ) - .build() - }) -} - /// array_resize SQL function pub(crate) fn array_resize_inner(arg: &[ArrayRef]) -> Result { if arg.len() < 2 || arg.len() > 3 { diff --git a/datafusion/functions-nested/src/reverse.rs b/datafusion/functions-nested/src/reverse.rs index aa898268d10b..b394c1afeafb 100644 --- a/datafusion/functions-nested/src/reverse.rs +++ b/datafusion/functions-nested/src/reverse.rs @@ -25,12 +25,12 @@ use arrow_schema::DataType::{LargeList, List, Null}; use arrow_schema::{DataType, FieldRef}; use datafusion_common::cast::{as_large_list_array, as_list_array}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; 
+use std::sync::Arc; make_udf_expr_and_func!( ArrayReverse, @@ -40,6 +40,23 @@ make_udf_expr_and_func!( array_reverse_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns the array with the order of the elements reversed.", + syntax_example = "array_reverse(array)", + sql_example = r#"```sql +> select array_reverse([1, 2, 3, 4]); ++------------------------------------------------------------+ +| array_reverse(List([1, 2, 3, 4])) | ++------------------------------------------------------------+ +| [4, 3, 2, 1] | ++------------------------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ) +)] #[derive(Debug)] pub(super) struct ArrayReverse { signature: Signature, @@ -85,37 +102,10 @@ impl ScalarUDFImpl for ArrayReverse { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_reverse_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_reverse_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns the array with the order of the elements reversed.", - - "array_reverse(array)") - .with_sql_example( - r#"```sql -> select array_reverse([1, 2, 3, 4]); -+------------------------------------------------------------+ -| array_reverse(List([1, 2, 3, 4])) | -+------------------------------------------------------------+ -| [4, 3, 2, 1] | -+------------------------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .build() - }) -} - /// array_reverse SQL function pub fn array_reverse_inner(arg: &[ArrayRef]) -> Result { if arg.len() != 1 { diff --git a/datafusion/functions-nested/src/set_ops.rs b/datafusion/functions-nested/src/set_ops.rs index faefa45e9217..202330715ba0 100644 --- a/datafusion/functions-nested/src/set_ops.rs +++ b/datafusion/functions-nested/src/set_ops.rs @@ -27,15 +27,15 @@ use arrow::row::{RowConverter, SortField}; use arrow_schema::DataType::{FixedSizeList, LargeList, List, Null}; use datafusion_common::cast::{as_large_list_array, as_list_array}; use datafusion_common::{exec_err, internal_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use itertools::Itertools; use std::any::Any; use std::collections::HashSet; use std::fmt::{Display, Formatter}; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; // Create static instances of ScalarUDFs for each function make_udf_expr_and_func!( @@ -62,6 +62,33 @@ make_udf_expr_and_func!( array_distinct_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns an array of elements that are present in both arrays (all elements from both arrays) without duplicates.", + syntax_example = "array_union(array1, array2)", + sql_example = r#"```sql +> select array_union([1, 2, 3, 4], [5, 6, 3, 4]); ++----------------------------------------------------+ +| array_union([1, 2, 3, 4], [5, 6, 3, 4]); | ++----------------------------------------------------+ +| [1, 2, 3, 4, 5, 6] | ++----------------------------------------------------+ +> select array_union([1, 2, 3, 4], [5, 6, 7, 8]); ++----------------------------------------------------+ +| array_union([1, 2, 3, 4], [5, 6, 7, 8]); | ++----------------------------------------------------+ +| [1, 2, 3, 4, 5, 6, 
7, 8] | ++----------------------------------------------------+ +```"#, + argument( + name = "array1", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "array2", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ) +)] #[derive(Debug)] pub(super) struct ArrayUnion { signature: Signature, @@ -111,47 +138,37 @@ impl ScalarUDFImpl for ArrayUnion { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_union_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_union_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns an array of elements that are present in both arrays (all elements from both arrays) with out duplicates.", - - "array_union(array1, array2)") - .with_sql_example( - r#"```sql -> select array_union([1, 2, 3, 4], [5, 6, 3, 4]); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns an array of elements in the intersection of array1 and array2.", + syntax_example = "array_intersect(array1, array2)", + sql_example = r#"```sql +> select array_intersect([1, 2, 3, 4], [5, 6, 3, 4]); +----------------------------------------------------+ -| array_union([1, 2, 3, 4], [5, 6, 3, 4]); | +| array_intersect([1, 2, 3, 4], [5, 6, 3, 4]); | +----------------------------------------------------+ -| [1, 2, 3, 4, 5, 6] | +| [3, 4] | +----------------------------------------------------+ -> select array_union([1, 2, 3, 4], [5, 6, 7, 8]); +> select array_intersect([1, 2, 3, 4], [5, 6, 7, 8]); +----------------------------------------------------+ -| array_union([1, 2, 3, 4], [5, 6, 7, 8]); | +| array_intersect([1, 2, 3, 4], [5, 6, 7, 8]); | +----------------------------------------------------+ -| [1, 2, 3, 4, 5, 6, 7, 8] | +| [] | 
+----------------------------------------------------+ ```"#, - ) - .with_argument( - "array1", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "array2", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .build() - }) -} - + argument( + name = "array1", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "array2", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ) +)] #[derive(Debug)] pub(super) struct ArrayIntersect { signature: Signature, @@ -201,45 +218,27 @@ impl ScalarUDFImpl for ArrayIntersect { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_intersect_doc()) + self.doc() } } -fn get_array_intersect_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns an array of elements in the intersection of array1 and array2.", - - "array_intersect(array1, array2)") - .with_sql_example( - r#"```sql -> select array_intersect([1, 2, 3, 4], [5, 6, 3, 4]); -+----------------------------------------------------+ -| array_intersect([1, 2, 3, 4], [5, 6, 3, 4]); | -+----------------------------------------------------+ -| [3, 4] | -+----------------------------------------------------+ -> select array_intersect([1, 2, 3, 4], [5, 6, 7, 8]); -+----------------------------------------------------+ -| array_intersect([1, 2, 3, 4], [5, 6, 7, 8]); | -+----------------------------------------------------+ -| [] | -+----------------------------------------------------+ +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns distinct values from the array after removing duplicates.", + syntax_example = "array_distinct(array)", + sql_example = r#"```sql +> select array_distinct([1, 
3, 2, 3, 1, 2, 4]); ++---------------------------------+ +| array_distinct(List([1,2,3,4])) | ++---------------------------------+ +| [1, 2, 3, 4] | ++---------------------------------+ ```"#, - ) - .with_argument( - "array1", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "array2", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .build() - }) -} - + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ) +)] #[derive(Debug)] pub(super) struct ArrayDistinct { signature: Signature, @@ -296,35 +295,10 @@ impl ScalarUDFImpl for ArrayDistinct { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_distinct_doc()) + self.doc() } } -fn get_array_distinct_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns distinct values from the array after removing duplicates.", - - "array_distinct(array)") - .with_sql_example( - r#"```sql -> select array_distinct([1, 3, 2, 3, 1, 2, 4]); -+---------------------------------+ -| array_distinct(List([1,2,3,4])) | -+---------------------------------+ -| [1, 2, 3, 4] | -+---------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .build() - }) -} - /// array_distinct SQL function /// example: from list [1, 3, 2, 3, 1, 2, 4] to [1, 2, 3, 4] fn array_distinct_inner(args: &[ArrayRef]) -> Result { diff --git a/datafusion/functions-nested/src/sort.rs b/datafusion/functions-nested/src/sort.rs index 043fedd89bf8..0c5309e545f7 100644 --- a/datafusion/functions-nested/src/sort.rs +++ b/datafusion/functions-nested/src/sort.rs @@ -25,12 +25,12 @@ use arrow_schema::DataType::{FixedSizeList, LargeList, List}; use arrow_schema::{DataType, Field, SortOptions}; use datafusion_common::cast::{as_list_array, as_string_array}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( ArraySort, @@ -40,6 +40,31 @@ make_udf_expr_and_func!( array_sort_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Sort array.", + syntax_example = "array_sort(array, desc, nulls_first)", + sql_example = r#"```sql +> select array_sort([3, 1, 2]); ++-----------------------------+ +| array_sort(List([3,1,2])) | ++-----------------------------+ +| [1, 2, 3] | ++-----------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "desc", + description = "Whether to sort in descending order(`ASC` or `DESC`)." + ), + argument( + name = "nulls_first", + description = "Whether to sort nulls first(`NULLS FIRST` or `NULLS LAST`)." 
+ ) +)] #[derive(Debug)] pub(super) struct ArraySort { signature: Signature, @@ -96,45 +121,10 @@ impl ScalarUDFImpl for ArraySort { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_sort_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_sort_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Sort array.", - - "array_sort(array, desc, nulls_first)") - .with_sql_example( - r#"```sql -> select array_sort([3, 1, 2]); -+-----------------------------+ -| array_sort(List([3,1,2])) | -+-----------------------------+ -| [1, 2, 3] | -+-----------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "desc", - "Whether to sort in descending order(`ASC` or `DESC`).", - ) - .with_argument( - "nulls_first", - "Whether to sort nulls first(`NULLS FIRST` or `NULLS LAST`).", - ) - .build() - }) -} - /// Array_sort SQL function pub fn array_sort_inner(args: &[ArrayRef]) -> Result { if args.is_empty() || args.len() > 3 { diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 9288b374dacb..ee022053cf71 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -42,13 +42,13 @@ use arrow_schema::DataType::{ }; use datafusion_common::cast::{as_large_list_array, as_list_array}; use datafusion_common::exec_err; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; use datafusion_functions::strings::StringArrayType; use datafusion_functions::{downcast_arg, downcast_named_arg}; -use std::sync::{Arc, OnceLock}; +use datafusion_macros::user_doc; +use std::sync::Arc; macro_rules! 
call_array_function { ($DATATYPE:expr, false) => { @@ -121,6 +121,29 @@ make_udf_expr_and_func!( "converts each element to its text representation.", // doc array_to_string_udf // internal function name ); + +#[user_doc( + doc_section(label = "Array Functions"), + description = "Converts each element to its text representation.", + syntax_example = "array_to_string(array, delimiter[, null_string])", + sql_example = r#"```sql +> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ','); ++----------------------------------------------------+ +| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) | ++----------------------------------------------------+ +| 1,2,3,4,5,6,7,8 | ++----------------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument(name = "delimiter", description = "Array element separator."), + argument( + name = "null_string", + description = "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior." 
+ ) +)] #[derive(Debug)] pub(super) struct ArrayToString { signature: Signature, @@ -175,45 +198,10 @@ impl ScalarUDFImpl for ArrayToString { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_to_string_doc()) + self.doc() } } -static DOCUMENTATION_ARRAY_TO_STRING: OnceLock = OnceLock::new(); - -fn get_array_to_string_doc() -> &'static Documentation { - DOCUMENTATION_ARRAY_TO_STRING.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Converts each element to its text representation.", - - "array_to_string(array, delimiter[, null_string])") - .with_sql_example( - r#"```sql -> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ','); -+----------------------------------------------------+ -| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) | -+----------------------------------------------------+ -| 1,2,3,4,5,6,7,8 | -+----------------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "delimiter", - "Array element separator.", - ) - .with_argument( - "null_string", - "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior.", - ) - .build() - }) -} - make_udf_expr_and_func!( StringToArray, string_to_array, @@ -221,6 +209,32 @@ make_udf_expr_and_func!( "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", // doc string_to_array_udf // internal function name ); + +#[user_doc( + doc_section(label = "Array Functions"), + description = "Splits a string into an array of substrings based on a delimiter. 
Any substrings matching the optional `null_str` argument are replaced with NULL.", + syntax_example = "string_to_array(str, delimiter[, null_str])", + sql_example = r#"```sql +> select string_to_array('abc##def', '##'); ++-----------------------------------+ +| string_to_array(Utf8('abc##def')) | ++-----------------------------------+ +| ['abc', 'def'] | ++-----------------------------------+ +> select string_to_array('abc def', ' ', 'def'); ++---------------------------------------------+ +| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) | ++---------------------------------------------+ +| ['abc', NULL] | ++---------------------------------------------+ +```"#, + argument(name = "str", description = "String expression to split."), + argument(name = "delimiter", description = "Delimiter string to split on."), + argument( + name = "null_str", + description = "Substring values to be replaced with `NULL`." + ) +)] #[derive(Debug)] pub(super) struct StringToArray { signature: Signature, @@ -284,51 +298,10 @@ impl ScalarUDFImpl for StringToArray { } fn documentation(&self) -> Option<&Documentation> { - Some(get_string_to_array_doc()) + self.doc() } } -static DOCUMENTATION_STRING_TO_ARRAY: OnceLock = OnceLock::new(); - -fn get_string_to_array_doc() -> &'static Documentation { - DOCUMENTATION_STRING_TO_ARRAY.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Splits a string into an array of substrings based on a delimiter. 
Any substrings matching the optional `null_str` argument are replaced with NULL.", - - "string_to_array(str, delimiter[, null_str])") - .with_sql_example( - r#"```sql -> select string_to_array('abc##def', '##'); -+-----------------------------------+ -| string_to_array(Utf8('abc##def')) | -+-----------------------------------+ -| ['abc', 'def'] | -+-----------------------------------+ -> select string_to_array('abc def', ' ', 'def'); -+---------------------------------------------+ -| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) | -+---------------------------------------------+ -| ['abc', NULL] | -+---------------------------------------------+ -```"#, - ) - .with_argument( - "str", - "String expression to split.", - ) - .with_argument( - "delimiter", - "Delimiter string to split on.", - ) - .with_argument( - "null_str", - "Substring values to be replaced with `NULL`.", - ) - .build() - }) -} - /// Array_to_string SQL function pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { if args.len() < 2 || args.len() > 3 { diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 79fe440f377b..4cf5ff4b7142 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2936,25 +2936,32 @@ _Alias of [array_position](#array_position)._ ### `array_intersect` -Returns distinct values from the array after removing duplicates. +Returns an array of elements in the intersection of array1 and array2. ``` -array_distinct(array) +array_intersect(array1, array2) ``` #### Arguments -- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. +- **array1**: Array expression. Can be a constant, column, or function, and any combination of array operators. +- **array2**: Array expression. Can be a constant, column, or function, and any combination of array operators. 
#### Example ```sql -> select array_distinct([1, 3, 2, 3, 1, 2, 4]); -+---------------------------------+ -| array_distinct(List([1,2,3,4])) | -+---------------------------------+ -| [1, 2, 3, 4] | -+---------------------------------+ +> select array_intersect([1, 2, 3, 4], [5, 6, 3, 4]); ++----------------------------------------------------+ +| array_intersect([1, 2, 3, 4], [5, 6, 3, 4]); | ++----------------------------------------------------+ +| [3, 4] | ++----------------------------------------------------+ +> select array_intersect([1, 2, 3, 4], [5, 6, 7, 8]); ++----------------------------------------------------+ +| array_intersect([1, 2, 3, 4], [5, 6, 7, 8]); | ++----------------------------------------------------+ +| [] | ++----------------------------------------------------+ ``` #### Aliases @@ -3217,10 +3224,10 @@ array_remove(array, element) ### `array_remove_all` -Removes the first element from the array equal to the given value. +Removes all elements from the array equal to the given value. ``` -array_remove(array, element) +array_remove_all(array, element) ``` #### Arguments @@ -3231,12 +3238,12 @@ array_remove(array, element) #### Example ```sql -> select array_remove([1, 2, 2, 3, 2, 1, 4], 2); -+----------------------------------------------+ -| array_remove(List([1,2,2,3,2,1,4]),Int64(2)) | -+----------------------------------------------+ -| [1, 2, 3, 2, 1, 4] | -+----------------------------------------------+ +> select array_remove_all([1, 2, 2, 3, 2, 1, 4], 2); ++--------------------------------------------------+ +| array_remove_all(List([1,2,2,3,2,1,4]),Int64(2)) | ++--------------------------------------------------+ +| [1, 3, 1, 4] | ++--------------------------------------------------+ ``` #### Aliases @@ -3245,26 +3252,27 @@ array_remove(array, element) ### `array_remove_n` -Removes the first element from the array equal to the given value. +Removes the first `max` elements from the array equal to the given value. 
``` -array_remove(array, element) +array_remove_n(array, element, max) ``` #### Arguments - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. - **element**: Element to be removed from the array. +- **max**: Number of first occurrences to remove. #### Example ```sql -> select array_remove([1, 2, 2, 3, 2, 1, 4], 2); -+----------------------------------------------+ -| array_remove(List([1,2,2,3,2,1,4]),Int64(2)) | -+----------------------------------------------+ -| [1, 2, 3, 2, 1, 4] | -+----------------------------------------------+ +> select array_remove_n([1, 2, 2, 3, 2, 1, 4], 2, 2); ++---------------------------------------------------------+ +| array_remove_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(2)) | ++---------------------------------------------------------+ +| [1, 3, 2, 1, 4] | ++---------------------------------------------------------+ ``` #### Aliases @@ -3307,10 +3315,10 @@ array_repeat(element, count) ### `array_replace` -Replaces the first `max` occurrences of the specified element with another specified element. +Replaces the first occurrence of the specified element with another specified element. ``` -array_replace_n(array, from, to, max) +array_replace(array, from, to) ``` #### Arguments @@ -3318,17 +3326,16 @@ array_replace_n(array, from, to, max) - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. - **from**: Initial element. - **to**: Final element. -- **max**: Number of first occurrences to replace. 
#### Example ```sql -> select array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2); -+-------------------------------------------------------------------+ -| array_replace_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(5),Int64(2)) | -+-------------------------------------------------------------------+ -| [1, 5, 5, 3, 2, 1, 4] | -+-------------------------------------------------------------------+ +> select array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5); ++--------------------------------------------------------+ +| array_replace(List([1,2,2,3,2,1,4]),Int64(2),Int64(5)) | ++--------------------------------------------------------+ +| [1, 5, 2, 3, 2, 1, 4] | ++--------------------------------------------------------+ ``` #### Aliases @@ -3337,10 +3344,10 @@ array_replace_n(array, from, to, max) ### `array_replace_all` -Replaces the first `max` occurrences of the specified element with another specified element. +Replaces all occurrences of the specified element with another specified element. ``` -array_replace_n(array, from, to, max) +array_replace_all(array, from, to) ``` #### Arguments @@ -3348,17 +3355,16 @@ array_replace_n(array, from, to, max) - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. - **from**: Initial element. - **to**: Final element. -- **max**: Number of first occurrences to replace. 
#### Example ```sql -> select array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2); -+-------------------------------------------------------------------+ -| array_replace_n(List([1,2,2,3,2,1,4]),Int64(2),Int64(5),Int64(2)) | -+-------------------------------------------------------------------+ -| [1, 5, 5, 3, 2, 1, 4] | -+-------------------------------------------------------------------+ +> select array_replace_all([1, 2, 2, 3, 2, 1, 4], 2, 5); ++------------------------------------------------------------+ +| array_replace_all(List([1,2,2,3,2,1,4]),Int64(2),Int64(5)) | ++------------------------------------------------------------+ +| [1, 5, 5, 3, 5, 1, 4] | ++------------------------------------------------------------+ ``` #### Aliases @@ -3543,25 +3549,32 @@ array_to_string(array, delimiter[, null_string]) ### `array_union` -Returns distinct values from the array after removing duplicates. +Returns an array of elements that are present in both arrays (all elements from both arrays) with out duplicates. ``` -array_distinct(array) +array_union(array1, array2) ``` #### Arguments -- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. +- **array1**: Array expression. Can be a constant, column, or function, and any combination of array operators. +- **array2**: Array expression. Can be a constant, column, or function, and any combination of array operators. 
#### Example ```sql -> select array_distinct([1, 3, 2, 3, 1, 2, 4]); -+---------------------------------+ -| array_distinct(List([1,2,3,4])) | -+---------------------------------+ -| [1, 2, 3, 4] | -+---------------------------------+ +> select array_union([1, 2, 3, 4], [5, 6, 3, 4]); ++----------------------------------------------------+ +| array_union([1, 2, 3, 4], [5, 6, 3, 4]); | ++----------------------------------------------------+ +| [1, 2, 3, 4, 5, 6] | ++----------------------------------------------------+ +> select array_union([1, 2, 3, 4], [5, 6, 7, 8]); ++----------------------------------------------------+ +| array_union([1, 2, 3, 4], [5, 6, 7, 8]); | ++----------------------------------------------------+ +| [1, 2, 3, 4, 5, 6, 7, 8] | ++----------------------------------------------------+ ``` #### Aliases @@ -3657,9 +3670,9 @@ generate_series(start, stop, step) #### Arguments -- **start**: start of the series. Ints, timestamps, dates or string types that can be coerced to Date32 are supported. -- **end**: end of the series (included). Type must be the same as start. -- **step**: increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges. +- **start**: Start of the series. Ints, timestamps, dates or string types that can be coerced to Date32 are supported. +- **end**: End of the series (included). Type must be the same as start. +- **step**: Increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges. 
#### Example From 383f279982777e5a4306f0072403bb822158b3fc Mon Sep 17 00:00:00 2001 From: Ian Lai <108986288+Chen-Yuan-Lai@users.noreply.github.com> Date: Mon, 30 Dec 2024 03:25:57 +0800 Subject: [PATCH 2/4] doc-gen: migrate scalar functions (array) documentation 2/3 (#13929) * doc-gen: migrate scalar functions (array) documentation 2/3 * fix: import doc and macro, fix typo and update function docs --------- Co-authored-by: Cheng-Yuan-Lai --- datafusion/functions-nested/src/flatten.rs | 49 +++---- datafusion/functions-nested/src/length.rs | 55 +++----- datafusion/functions-nested/src/make_array.rs | 53 +++---- datafusion/functions-nested/src/map.rs | 105 +++++++------- .../functions-nested/src/map_extract.rs | 67 ++++----- datafusion/functions-nested/src/map_keys.rs | 51 +++---- datafusion/functions-nested/src/map_values.rs | 52 +++---- datafusion/functions-nested/src/position.rs | 129 ++++++++---------- .../source/user-guide/sql/scalar_functions.md | 24 ++-- 9 files changed, 243 insertions(+), 342 deletions(-) diff --git a/datafusion/functions-nested/src/flatten.rs b/datafusion/functions-nested/src/flatten.rs index 7cb52ae4c5c9..30bf2fcbf624 100644 --- a/datafusion/functions-nested/src/flatten.rs +++ b/datafusion/functions-nested/src/flatten.rs @@ -26,13 +26,13 @@ use datafusion_common::cast::{ as_generic_list_array, as_large_list_array, as_list_array, }; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( Flatten, @@ -42,6 +42,23 @@ make_udf_expr_and_func!( flatten_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Converts an array of arrays to a flat array.\n\n- Applies to any depth of nested arrays\n- Does not change 
arrays that are already flat\n\nThe flattened array contains all the elements from all source arrays.", + syntax_example = "flatten(array)", + sql_example = r#"```sql +> select flatten([[1, 2], [3, 4]]); ++------------------------------+ +| flatten(List([1,2], [3,4])) | ++------------------------------+ +| [1, 2, 3, 4] | ++------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ) +)] #[derive(Debug)] pub struct Flatten { signature: Signature, @@ -118,35 +135,9 @@ impl ScalarUDFImpl for Flatten { } fn documentation(&self) -> Option<&Documentation> { - Some(get_flatten_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_flatten_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Converts an array of arrays to a flat array.\n\n- Applies to any depth of nested arrays\n- Does not change arrays that are already flat\n\nThe flattened array contains all the elements from all source arrays.", - - "flatten(array)") - .with_sql_example( - r#"```sql -> select flatten([[1, 2], [3, 4]]); -+------------------------------+ -| flatten(List([1,2], [3,4])) | -+------------------------------+ -| [1, 2, 3, 4] | -+------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .build() - }) -} /// Flatten SQL function pub fn flatten_inner(args: &[ArrayRef]) -> Result { diff --git a/datafusion/functions-nested/src/length.rs b/datafusion/functions-nested/src/length.rs index 2f03842cbeeb..70a9188a2c3d 100644 --- a/datafusion/functions-nested/src/length.rs +++ b/datafusion/functions-nested/src/length.rs @@ -25,13 +25,13 @@ use arrow_schema::DataType; use arrow_schema::DataType::{FixedSizeList, LargeList, List, UInt64}; use datafusion_common::cast::{as_generic_list_array, as_int64_array}; use datafusion_common::{exec_err, internal_datafusion_err, plan_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; use datafusion_functions::{downcast_arg, downcast_named_arg}; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( ArrayLength, @@ -41,6 +41,24 @@ make_udf_expr_and_func!( array_length_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns the length of the array dimension.", + syntax_example = "array_length(array, dimension)", + sql_example = r#"```sql +> select array_length([1, 2, 3, 4, 5], 1); ++-------------------------------------------+ +| array_length(List([1,2,3,4,5]), 1) | ++-------------------------------------------+ +| 5 | ++-------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." 
+ ), + argument(name = "dimension", description = "Array dimension.") +)] #[derive(Debug)] pub struct ArrayLength { signature: Signature, @@ -96,41 +114,10 @@ impl ScalarUDFImpl for ArrayLength { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_length_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_length_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns the length of the array dimension.", - - "array_length(array, dimension)") - .with_sql_example( - r#"```sql -> select array_length([1, 2, 3, 4, 5], 1); -+-------------------------------------------+ -| array_length(List([1,2,3,4,5]), 1) | -+-------------------------------------------+ -| 5 | -+-------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "dimension", - "Array dimension.", - ) - .build() - }) -} - /// Array_length SQL function pub fn array_length_inner(args: &[ArrayRef]) -> Result { if args.len() != 1 && args.len() != 2 { diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs index efedd897de87..0283cdd40275 100644 --- a/datafusion/functions-nested/src/make_array.rs +++ b/datafusion/functions-nested/src/make_array.rs @@ -18,9 +18,10 @@ //! [`ScalarUDFImpl`] definitions for `make_array` function. 
use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use std::vec; +use crate::utils::make_scalar_function; use arrow::array::{ArrayData, Capacities, MutableArrayData}; use arrow_array::{ new_null_array, Array, ArrayRef, GenericListArray, NullArray, OffsetSizeTrait, @@ -33,13 +34,11 @@ use datafusion_common::{plan_err, Result}; use datafusion_expr::binary::{ try_type_union_resolution_with_struct, type_union_resolution, }; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::TypeSignature; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; - -use crate::utils::make_scalar_function; +use datafusion_macros::user_doc; make_udf_expr_and_func!( MakeArray, @@ -48,6 +47,23 @@ make_udf_expr_and_func!( make_array_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns an array using the specified input expressions.", + syntax_example = "make_array(expression1[, ..., expression_n])", + sql_example = r#"```sql +> select make_array(1, 2, 3, 4, 5); ++----------------------------------------------------------+ +| make_array(Int64(1),Int64(2),Int64(3),Int64(4),Int64(5)) | ++----------------------------------------------------------+ +| [1, 2, 3, 4, 5] | ++----------------------------------------------------------+ +```"#, + argument( + name = "expression_n", + description = "Expression to include in the output array. Can be a constant, column, or function, and any combination of arithmetic or string operators." 
+ ) +)] #[derive(Debug)] pub struct MakeArray { signature: Signature, @@ -139,37 +155,10 @@ impl ScalarUDFImpl for MakeArray { } fn documentation(&self) -> Option<&Documentation> { - Some(get_make_array_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_make_array_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns an array using the specified input expressions.", - - "make_array(expression1[, ..., expression_n])") - .with_sql_example( - r#"```sql -> select make_array(1, 2, 3, 4, 5); -+----------------------------------------------------------+ -| make_array(Int64(1),Int64(2),Int64(3),Int64(4),Int64(5)) | -+----------------------------------------------------------+ -| [1, 2, 3, 4, 5] | -+----------------------------------------------------------+ -```"#, - ) - .with_argument( - "expression_n", - "Expression to include in the output array. Can be a constant, column, or function, and any combination of arithmetic or string operators.", - ) - .build() - }) -} - // Empty array is a special case that is useful for many other array functions pub(super) fn empty_array_type() -> DataType { List(Arc::new(Field::new_list_field(DataType::Int64, true))) diff --git a/datafusion/functions-nested/src/map.rs b/datafusion/functions-nested/src/map.rs index d21a19c9fb33..0b098a30b758 100644 --- a/datafusion/functions-nested/src/map.rs +++ b/datafusion/functions-nested/src/map.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::collections::VecDeque; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::ArrayData; use arrow_array::{Array, ArrayRef, MapArray, OffsetSizeTrait, StructArray}; @@ -27,10 +27,10 @@ use arrow_schema::{DataType, Field, SchemaBuilder}; use datafusion_common::utils::{fixed_size_list_to_arrays, list_to_arrays}; use datafusion_common::{exec_err, HashSet, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; -use 
datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; use datafusion_expr::{ ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use crate::make_array::make_array; @@ -181,6 +181,50 @@ fn make_map_batch_internal( }) } +#[user_doc( + doc_section(label = "Map Functions"), + description = "Returns an Arrow map with the specified key-value pairs.\n\n\ + The `make_map` function creates a map from two lists: one for keys and one for values. Each key must be unique and non-null.", + syntax_example = "map(key, value)\nmap(key: value)\nmake_map(['key1', 'key2'], ['value1', 'value2'])", + sql_example = r#" +```sql +-- Using map function +SELECT MAP('type', 'test'); +---- +{type: test} + +SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]); +---- +{POST: 41, HEAD: 33, PATCH: } + +SELECT MAP([[1,2], [3,4]], ['a', 'b']); +---- +{[1, 2]: a, [3, 4]: b} + +SELECT MAP { 'a': 1, 'b': 2 }; +---- +{a: 1, b: 2} + +-- Using make_map function +SELECT MAKE_MAP(['POST', 'HEAD'], [41, 33]); +---- +{POST: 41, HEAD: 33} + +SELECT MAKE_MAP(['key1', 'key2'], ['value1', null]); +---- +{key1: value1, key2: } +```"#, + argument( + name = "key", + description = "For `map`: Expression to be used for key. Can be a constant, column, function, or any combination of arithmetic or string operators.\n\ + For `make_map`: The list of keys to be used in the map. Each key must be unique and non-null." + ), + argument( + name = "value", + description = "For `map`: Expression to be used for value. Can be a constant, column, function, or any combination of arithmetic or string operators.\n\ + For `make_map`: The list of values to be mapped to the corresponding keys." 
+ ) +)] #[derive(Debug)] pub struct MapFunc { signature: Signature, @@ -247,65 +291,10 @@ impl ScalarUDFImpl for MapFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_map_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_map_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MAP, - "Returns an Arrow map with the specified key-value pairs.\n\n\ - The `make_map` function creates a map from two lists: one for keys and one for values. Each key must be unique and non-null.", - - "map(key, value)\nmap(key: value)\nmake_map(['key1', 'key2'], ['value1', 'value2'])" - ) - .with_sql_example( - r#" -```sql --- Using map function -SELECT MAP('type', 'test'); ----- -{type: test} - -SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]); ----- -{POST: 41, HEAD: 33, PATCH: } - -SELECT MAP([[1,2], [3,4]], ['a', 'b']); ----- -{[1, 2]: a, [3, 4]: b} - -SELECT MAP { 'a': 1, 'b': 2 }; ----- -{a: 1, b: 2} - --- Using make_map function -SELECT MAKE_MAP(['POST', 'HEAD'], [41, 33]); ----- -{POST: 41, HEAD: 33} - -SELECT MAKE_MAP(['key1', 'key2'], ['value1', null]); ----- -{key1: value1, key2: } -```"#, - ) - .with_argument( - "key", - "For `map`: Expression to be used for key. Can be a constant, column, function, or any combination of arithmetic or string operators.\n\ - For `make_map`: The list of keys to be used in the map. Each key must be unique and non-null." - ) - .with_argument( - "value", - "For `map`: Expression to be used for value. Can be a constant, column, function, or any combination of arithmetic or string operators.\n\ - For `make_map`: The list of values to be mapped to the corresponding keys." 
- ) - .build() - }) -} - fn get_element_type(data_type: &DataType) -> Result<&DataType> { match data_type { DataType::List(element) => Ok(element.data_type()), diff --git a/datafusion/functions-nested/src/map_extract.rs b/datafusion/functions-nested/src/map_extract.rs index 24f396e741b2..1ade3f67c973 100644 --- a/datafusion/functions-nested/src/map_extract.rs +++ b/datafusion/functions-nested/src/map_extract.rs @@ -26,12 +26,12 @@ use arrow_buffer::OffsetBuffer; use arrow_schema::Field; use datafusion_common::{cast::as_map_array, exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use std::vec; use crate::utils::{get_map_entry_field, make_scalar_function}; @@ -45,6 +45,32 @@ make_udf_expr_and_func!( map_extract_udf ); +#[user_doc( + doc_section(label = "Map Functions"), + description = "Returns a list containing the value for the given key or an empty list if the key is not present in the map.", + syntax_example = "map_extract(map, key)", + sql_example = r#"```sql +SELECT map_extract(MAP {'a': 1, 'b': NULL, 'c': 3}, 'a'); +---- +[1] + +SELECT map_extract(MAP {1: 'one', 2: 'two'}, 2); +---- +['two'] + +SELECT map_extract(MAP {'x': 10, 'y': NULL, 'z': 30}, 'y'); +---- +[] +```"#, + argument( + name = "map", + description = "Map expression. Can be a constant, column, or function, and any combination of map operators." + ), + argument( + name = "key", + description = "Key to extract from the map. Can be a constant, column, or function, any combination of arithmetic or string operators, or a named expression of the previously listed." 
+ ) +)] #[derive(Debug)] pub(super) struct MapExtract { signature: Signature, @@ -109,45 +135,10 @@ impl ScalarUDFImpl for MapExtract { } fn documentation(&self) -> Option<&Documentation> { - Some(get_map_extract_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_map_extract_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MAP, - "Returns a list containing the value for the given key or an empty list if the key is not present in the map.", - "map_extract(map, key)") - .with_sql_example( - r#"```sql -SELECT map_extract(MAP {'a': 1, 'b': NULL, 'c': 3}, 'a'); ----- -[1] - -SELECT map_extract(MAP {1: 'one', 2: 'two'}, 2); ----- -['two'] - -SELECT map_extract(MAP {'x': 10, 'y': NULL, 'z': 30}, 'y'); ----- -[] -```"#, - ) - .with_argument( - "map", - "Map expression. Can be a constant, column, or function, and any combination of map operators.", - ) - .with_argument( - "key", - "Key to extract from the map. 
Can be a constant, column, or function, any combination of arithmetic or string operators, or a named expression of the previously listed.", - ) - .build() - }) -} - fn general_map_extract_inner( map_array: &MapArray, query_keys_array: &dyn Array, diff --git a/datafusion/functions-nested/src/map_keys.rs b/datafusion/functions-nested/src/map_keys.rs index 1d19cb8492f0..d3afce3e402e 100644 --- a/datafusion/functions-nested/src/map_keys.rs +++ b/datafusion/functions-nested/src/map_keys.rs @@ -21,13 +21,13 @@ use crate::utils::{get_map_entry_field, make_scalar_function}; use arrow_array::{Array, ArrayRef, ListArray}; use arrow_schema::{DataType, Field}; use datafusion_common::{cast::as_map_array, exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; use datafusion_expr::{ ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( MapKeysFunc, @@ -37,6 +37,24 @@ make_udf_expr_and_func!( map_keys_udf ); +#[user_doc( + doc_section(label = "Map Functions"), + description = "Returns a list of all keys in the map.", + syntax_example = "map_keys(map)", + sql_example = r#"```sql +SELECT map_keys(MAP {'a': 1, 'b': NULL, 'c': 3}); +---- +[a, b, c] + +SELECT map_keys(map([100, 5], [42, 43])); +---- +[100, 5] +```"#, + argument( + name = "map", + description = "Map expression. Can be a constant, column, or function, and any combination of map operators." 
+ ) +)] #[derive(Debug)] pub(crate) struct MapKeysFunc { signature: Signature, @@ -87,37 +105,10 @@ impl ScalarUDFImpl for MapKeysFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_map_keys_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_map_keys_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MAP, - "Returns a list of all keys in the map.", - "map_keys(map)") - .with_sql_example( - r#"```sql -SELECT map_keys(MAP {'a': 1, 'b': NULL, 'c': 3}); ----- -[a, b, c] - -SELECT map_keys(map([100, 5], [42, 43])); ----- -[100, 5] -```"#, - ) - .with_argument( - "map", - "Map expression. Can be a constant, column, or function, and any combination of map operators." - ) - .build() - }) -} - fn map_keys_inner(args: &[ArrayRef]) -> Result { if args.len() != 1 { return exec_err!("map_keys expects single argument"); diff --git a/datafusion/functions-nested/src/map_values.rs b/datafusion/functions-nested/src/map_values.rs index 816ebe74aff0..fb3aec009f50 100644 --- a/datafusion/functions-nested/src/map_values.rs +++ b/datafusion/functions-nested/src/map_values.rs @@ -21,13 +21,13 @@ use crate::utils::{get_map_entry_field, make_scalar_function}; use arrow_array::{Array, ArrayRef, ListArray}; use arrow_schema::{DataType, Field}; use datafusion_common::{cast::as_map_array, exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; use datafusion_expr::{ ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; make_udf_expr_and_func!( MapValuesFunc, @@ -37,6 +37,24 @@ make_udf_expr_and_func!( map_values_udf ); +#[user_doc( + doc_section(label = "Map Functions"), + description = "Returns a list of all values in the map.", + syntax_example = "map_values(map)", + sql_example = r#"```sql +SELECT 
map_values(MAP {'a': 1, 'b': NULL, 'c': 3}); +---- +[1, , 3] + +SELECT map_values(map([100, 5], [42, 43])); +---- +[42, 43] +```"#, + argument( + name = "map", + description = "Map expression. Can be a constant, column, or function, and any combination of map operators." + ) +)] #[derive(Debug)] pub(crate) struct MapValuesFunc { signature: Signature, @@ -87,38 +105,10 @@ impl ScalarUDFImpl for MapValuesFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_map_values_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_map_values_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MAP, - "Returns a list of all values in the map.", - - "map_values(map)") - .with_sql_example( - r#"```sql -SELECT map_values(MAP {'a': 1, 'b': NULL, 'c': 3}); ----- -[1, , 3] - -SELECT map_values(map([100, 5], [42, 43])); ----- -[42, 43] -```"#, - ) - .with_argument( - "map", - "Map expression. Can be a constant, column, or function, and any combination of map operators." 
- ) - .build() - }) -} - fn map_values_inner(args: &[ArrayRef]) -> Result { if args.len() != 1 { return exec_err!("map_values expects single argument"); diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs index feacc7006192..f56fdf734c9c 100644 --- a/datafusion/functions-nested/src/position.rs +++ b/datafusion/functions-nested/src/position.rs @@ -19,12 +19,13 @@ use arrow_schema::DataType::{LargeList, List, UInt64}; use arrow_schema::{DataType, Field}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; + use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow_array::types::UInt64Type; use arrow_array::{ @@ -46,6 +47,34 @@ make_udf_expr_and_func!( array_position_udf ); +#[user_doc( + doc_section(label = "Array Functions"), + description = "Returns the position of the first occurrence of the specified element in the array.", + syntax_example = "array_position(array, element)\narray_position(array, element, index)", + sql_example = r#"```sql +> select array_position([1, 2, 2, 3, 1, 4], 2); ++----------------------------------------------+ +| array_position(List([1,2,2,3,1,4]),Int64(2)) | ++----------------------------------------------+ +| 2 | ++----------------------------------------------+ +> select array_position([1, 2, 2, 3, 1, 4], 2, 3); ++----------------------------------------------------+ +| array_position(List([1,2,2,3,1,4]),Int64(2), Int64(3)) | ++----------------------------------------------------+ +| 3 | ++----------------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "element", + description = "Element to search for position in the array." 
+ ), + argument(name = "index", description = "Index at which to start searching.") +)] #[derive(Debug)] pub(super) struct ArrayPosition { signature: Signature, @@ -95,51 +124,10 @@ impl ScalarUDFImpl for ArrayPosition { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_position_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_position_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Returns the position of the first occurrence of the specified element in the array.", - - "array_position(array, element)\narray_position(array, element, index)") - .with_sql_example( - r#"```sql -> select array_position([1, 2, 2, 3, 1, 4], 2); -+----------------------------------------------+ -| array_position(List([1,2,2,3,1,4]),Int64(2)) | -+----------------------------------------------+ -| 2 | -+----------------------------------------------+ -> select array_position([1, 2, 2, 3, 1, 4], 2, 3); -+----------------------------------------------------+ -| array_position(List([1,2,2,3,1,4]),Int64(2), Int64(3)) | -+----------------------------------------------------+ -| 3 | -+----------------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. 
Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "element", - "Element to search for position in the array.", - ) - .with_argument( - "index", - "Index at which to start searching.", - ) - .build() - }) -} - /// Array_position SQL function pub fn array_position_inner(args: &[ArrayRef]) -> Result { if args.len() < 2 || args.len() > 3 { @@ -224,6 +212,28 @@ make_udf_expr_and_func!( "searches for an element in the array, returns all occurrences.", // doc array_positions_udf // internal function name ); + +#[user_doc( + doc_section(label = "Array Functions"), + description = "Searches for an element in the array, returns all occurrences.", + syntax_example = "array_positions(array, element)", + sql_example = r#"```sql +> select array_positions([1, 2, 2, 3, 1, 4], 2); ++-----------------------------------------------+ +| array_positions(List([1,2,2,3,1,4]),Int64(2)) | ++-----------------------------------------------+ +| [2, 3] | ++-----------------------------------------------+ +```"#, + argument( + name = "array", + description = "Array expression. Can be a constant, column, or function, and any combination of array operators." + ), + argument( + name = "element", + description = "Element to search for position in the array." 
+ ) +)] #[derive(Debug)] pub(super) struct ArrayPositions { signature: Signature, @@ -268,39 +278,10 @@ impl ScalarUDFImpl for ArrayPositions { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_positions_doc()) + self.doc() } } -fn get_array_positions_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_ARRAY, - "Searches for an element in the array, returns all occurrences.", - - "array_positions(array, element)") - .with_sql_example( - r#"```sql -> select array_positions([1, 2, 2, 3, 1, 4], 2); -+-----------------------------------------------+ -| array_positions(List([1,2,2,3,1,4]),Int64(2)) | -+-----------------------------------------------+ -| [2, 3] | -+-----------------------------------------------+ -```"#, - ) - .with_argument( - "array", - "Array expression. Can be a constant, column, or function, and any combination of array operators.", - ) - .with_argument( - "element", - "Element to search for positions in the array.", - ) - .build() - }) -} - /// Array_positions SQL function pub fn array_positions_inner(args: &[ArrayRef]) -> Result { if args.len() != 2 { diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 4cf5ff4b7142..f17e7189a948 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -3122,34 +3122,26 @@ array_position(array, element, index) ### `array_positions` -Returns the position of the first occurrence of the specified element in the array. +Searches for an element in the array, returns all occurrences. ``` -array_position(array, element) -array_position(array, element, index) +array_positions(array, element) ``` #### Arguments - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. - **element**: Element to search for position in the array. -- **index**: Index at which to start searching. 
#### Example ```sql -> select array_position([1, 2, 2, 3, 1, 4], 2); -+----------------------------------------------+ -| array_position(List([1,2,2,3,1,4]),Int64(2)) | -+----------------------------------------------+ -| 2 | -+----------------------------------------------+ -> select array_position([1, 2, 2, 3, 1, 4], 2, 3); -+----------------------------------------------------+ -| array_position(List([1,2,2,3,1,4]),Int64(2), Int64(3)) | -+----------------------------------------------------+ -| 3 | -+----------------------------------------------------+ +> select array_positions([1, 2, 2, 3, 1, 4], 2); ++-----------------------------------------------+ +| array_positions(List([1,2,2,3,1,4]),Int64(2)) | ++-----------------------------------------------+ +| [2, 3] | ++-----------------------------------------------+ ``` #### Aliases From ab69bb04bbcf1b4fd6b528a8433056e9adae67a8 Mon Sep 17 00:00:00 2001 From: Ian Lai <108986288+Chen-Yuan-Lai@users.noreply.github.com> Date: Mon, 30 Dec 2024 03:35:08 +0800 Subject: [PATCH 3/4] doc-gen: migrate scalar functions (string) documentation 4/4 (#13927) * doc-gen: migrate scalar functions (string) documentation 4/4 * fix: fix typo and update function docs --------- Co-authored-by: Cheng-Yuan-Lai --- datafusion/functions/src/unicode/left.rs | 48 ++++++------- datafusion/functions/src/unicode/lpad.rs | 50 +++++++------- datafusion/functions/src/unicode/right.rs | 48 ++++++------- datafusion/functions/src/unicode/rpad.rs | 54 +++++++-------- datafusion/functions/src/unicode/strpos.rs | 45 ++++++------- datafusion/functions/src/unicode/substr.rs | 55 ++++++++------- .../functions/src/unicode/substrindex.rs | 67 +++++++++---------- datafusion/functions/src/unicode/translate.rs | 45 ++++++------- .../source/user-guide/sql/scalar_functions.md | 2 +- 9 files changed, 186 insertions(+), 228 deletions(-) diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index e583523d84a0..c8fbee4d90d8 
100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::cmp::Ordering; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, @@ -31,12 +31,28 @@ use datafusion_common::cast::{ }; use datafusion_common::exec_err; use datafusion_common::Result; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "String Functions"), + description = "Returns a specified number of characters from the left side of a string.", + syntax_example = "left(str, n)", + sql_example = r#"```sql +> select left('datafusion', 4); ++-----------------------------------+ +| left(Utf8("datafusion"),Int64(4)) | ++-----------------------------------+ +| data | ++-----------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument(name = "n", description = "Number of characters to return."), + related_udf(name = "right") +)] #[derive(Debug)] pub struct LeftFunc { signature: Signature, @@ -99,36 +115,10 @@ impl ScalarUDFImpl for LeftFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_left_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_left_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - "Returns a specified number of characters from the left side of a string.", - "left(str, n)", - ) - .with_sql_example( - r#"```sql -> select left('datafusion', 4); -+-----------------------------------+ -| left(Utf8("datafusion"),Int64(4)) | -+-----------------------------------+ -| data | -+-----------------------------------+ -```"#, - ) - 
.with_standard_argument("str", Some("String")) - .with_argument("n", "Number of characters to return.") - .with_related_udf("right") - .build() - }) -} - /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. /// left('abcde', 2) = 'ab' /// The implementation uses UTF-8 code points as characters diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index f1750d2277ca..0b41071cad9e 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::Write; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::{ Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, @@ -31,12 +31,32 @@ use crate::strings::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "String Functions"), + description = "Pads the left side of a string with another string to a specified string length.", + syntax_example = "lpad(str, n[, padding_str])", + sql_example = r#"```sql +> select lpad('Dolly', 10, 'hello'); ++---------------------------------------------+ +| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) | ++---------------------------------------------+ +| helloDolly | ++---------------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument(name = "n", description = "String length to pad to."), + argument( + name = "padding_str", + description = "Optional string expression to pad with. 
Can be a constant, column, or function, and any combination of string operators. _Default is a space._" + ), + related_udf(name = "rpad") +)] #[derive(Debug)] pub struct LPadFunc { signature: Signature, @@ -103,34 +123,10 @@ impl ScalarUDFImpl for LPadFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_lpad_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_lpad_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - "Pads the left side of a string with another string to a specified string length.", - "lpad(str, n[, padding_str])") - .with_sql_example(r#"```sql -> select lpad('Dolly', 10, 'hello'); -+---------------------------------------------+ -| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) | -+---------------------------------------------+ -| helloDolly | -+---------------------------------------------+ -```"#) - .with_standard_argument("str", Some("String")) - .with_argument("n", "String length to pad to.") - .with_argument("padding_str", "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") - .with_related_udf("rpad") - .build() - }) -} - /// Extends the string to length 'length' by prepending the characters fill (a space by default). /// If the string is already longer than length then it is truncated (on the right). 
/// lpad('hi', 5, 'xy') = 'xyxhi' diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 4e414fbae5cb..3561176f1dd7 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::cmp::{max, Ordering}; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, @@ -31,12 +31,28 @@ use datafusion_common::cast::{ }; use datafusion_common::exec_err; use datafusion_common::Result; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "String Functions"), + description = "Returns a specified number of characters from the right side of a string.", + syntax_example = "right(str, n)", + sql_example = r#"```sql +> select right('datafusion', 6); ++------------------------------------+ +| right(Utf8("datafusion"),Int64(6)) | ++------------------------------------+ +| fusion | ++------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument(name = "n", description = "Number of characters to return."), + related_udf(name = "left") +)] #[derive(Debug)] pub struct RightFunc { signature: Signature, @@ -99,36 +115,10 @@ impl ScalarUDFImpl for RightFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_right_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_right_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - "Returns a specified number of characters from the right side of a string.", - "right(str, n)", - ) - .with_sql_example( - r#"```sql -> select right('datafusion', 6); 
-+------------------------------------+ -| right(Utf8("datafusion"),Int64(6)) | -+------------------------------------+ -| fusion | -+------------------------------------+ -```"#, - ) - .with_standard_argument("str", Some("String")) - .with_argument("n", "Number of characters to return") - .with_related_udf("left") - .build() - }) -} - /// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. /// right('abcde', 2) = 'de' /// The implementation uses UTF-8 code points as characters diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index d5a0079c72aa..890230ac4344 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -25,17 +25,37 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; use datafusion_common::DataFusionError; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; use std::fmt::Write; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; +#[user_doc( + doc_section(label = "String Functions"), + description = "Pads the right side of a string with another string to a specified string length.", + syntax_example = "rpad(str, n[, padding_str])", + sql_example = r#"```sql +> select rpad('datafusion', 20, '_-'); ++-----------------------------------------------+ +| rpad(Utf8("datafusion"),Int64(20),Utf8("_-")) | ++-----------------------------------------------+ +| datafusion_-_-_-_-_- | ++-----------------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument(name = "n", description = "String length to pad to."), + 
argument( + name = "padding_str", + description = "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._" + ), + related_udf(name = "lpad") +)] #[derive(Debug)] pub struct RPadFunc { signature: Signature, @@ -122,38 +142,10 @@ impl ScalarUDFImpl for RPadFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_rpad_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_rpad_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - "Pads the right side of a string with another string to a specified string length.", - "rpad(str, n[, padding_str])") - .with_sql_example(r#"```sql -> select rpad('datafusion', 20, '_-'); -+-----------------------------------------------+ -| rpad(Utf8("datafusion"),Int64(20),Utf8("_-")) | -+-----------------------------------------------+ -| datafusion_-_-_-_-_- | -+-----------------------------------------------+ -```"#) - .with_standard_argument( - "str", - Some("String"), - ) - .with_argument("n", "String length to pad to.") - .with_argument("padding_str", - "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") - .with_related_udf("lpad") - .build() - }) -} - pub fn rpad( args: &[ArrayRef], ) -> Result { diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs index 569af87a4b50..b4bfc2d87627 100644 --- a/datafusion/functions/src/unicode/strpos.rs +++ b/datafusion/functions/src/unicode/strpos.rs @@ -16,18 +16,34 @@ // under the License. 
use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use crate::strings::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_int_type}; use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray}; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "String Functions"), + description = "Returns the starting position of a specified substring in a string. Positions begin at 1. If the substring does not exist in the string, the function returns 0.", + syntax_example = "strpos(str, substr)", + alternative_syntax = "position(substr in origstr)", + sql_example = r#"```sql +> select strpos('datafusion', 'fus'); ++----------------------------------------+ +| strpos(Utf8("datafusion"),Utf8("fus")) | ++----------------------------------------+ +| 5 | ++----------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument(name = "substr", description = "Substring expression to search for.") +)] #[derive(Debug)] pub struct StrposFunc { signature: Signature, @@ -79,33 +95,10 @@ impl ScalarUDFImpl for StrposFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_strpos_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_strpos_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - "Returns the starting position of a specified substring in a string. Positions begin at 1. 
If the substring does not exist in the string, the function returns 0.", - "strpos(str, substr)") - .with_sql_example(r#"```sql -> select strpos('datafusion', 'fus'); -+----------------------------------------+ -| strpos(Utf8("datafusion"),Utf8("fus")) | -+----------------------------------------+ -| 5 | -+----------------------------------------+ -```"#) - .with_standard_argument("str", Some("String")) - .with_argument("substr", "Substring expression to search for.") - .with_alternative_syntax("position(substr in origstr)") - .build() - }) -} - fn strpos(args: &[ArrayRef]) -> Result { match (args[0].data_type(), args[1].data_type()) { (DataType::Utf8, DataType::Utf8) => { diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 687f77dbef5b..df6a50ef1775 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use crate::strings::{make_and_append_view, StringArrayType}; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -28,11 +28,34 @@ use arrow::datatypes::DataType; use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, plan_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; - +use datafusion_macros::user_doc; + +#[user_doc( + doc_section(label = "String Functions"), + description = "Extracts a substring of a specified number of characters from a specific starting position in a string.", + syntax_example = "substr(str, start_pos[, length])", + alternative_syntax = "substring(str from start_pos for length)", + sql_example = r#"```sql +> select substr('datafusion', 5, 3); ++----------------------------------------------+ +| 
substr(Utf8("datafusion"),Int64(5),Int64(3)) | ++----------------------------------------------+ +| fus | ++----------------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument( + name = "start_pos", + description = "Character position to start the substring at. The first character in the string has a position of 1." + ), + argument( + name = "length", + description = "Number of characters to extract. If not specified, returns the rest of the string after the start position." + ) +)] #[derive(Debug)] pub struct SubstrFunc { signature: Signature, @@ -154,34 +177,10 @@ impl ScalarUDFImpl for SubstrFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_substr_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_substr_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - "Extracts a substring of a specified number of characters from a specific starting position in a string.", - "substr(str, start_pos[, length])") - .with_sql_example(r#"```sql -> select substr('datafusion', 5, 3); -+----------------------------------------------+ -| substr(Utf8("datafusion"),Int64(5),Int64(3)) | -+----------------------------------------------+ -| fus | -+----------------------------------------------+ -```"#) - .with_standard_argument("str", Some("String")) - .with_argument("start_pos", "Character position to start the substring at. The first character in the string has a position of 1.") - .with_argument("length", "Number of characters to extract. If not specified, returns the rest of the string after the start position.") - .with_alternative_syntax("substring(str from start_pos for length)") - .build() - }) -} - /// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) 
/// substr('alphabet', 3) = 'phabet' /// substr('alphabet', 3, 2) = 'ph' diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index 61cd989bb964..60ccd2204788 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, @@ -26,12 +26,42 @@ use arrow::datatypes::{DataType, Int32Type, Int64Type}; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "String Functions"), + description = r#"Returns the substring from str before count occurrences of the delimiter delim. +If count is positive, everything to the left of the final delimiter (counting from the left) is returned. 
+If count is negative, everything to the right of the final delimiter (counting from the right) is returned."#, + syntax_example = "substr_index(str, delim, count)", + sql_example = r#"```sql +> select substr_index('www.apache.org', '.', 1); ++---------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) | ++---------------------------------------------------------+ +| www | ++---------------------------------------------------------+ +> select substr_index('www.apache.org', '.', -1); ++----------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) | ++----------------------------------------------------------+ +| org | ++----------------------------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument( + name = "delim", + description = "The string to find in str to split str." + ), + argument( + name = "count", + description = "The number of times to search for the delimiter. Can be either a positive or negative number." + ) +)] #[derive(Debug)] pub struct SubstrIndexFunc { signature: Signature, @@ -91,41 +121,10 @@ impl ScalarUDFImpl for SubstrIndexFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_substr_index_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_substr_index_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STRING, - r#"Returns the substring from str before count occurrences of the delimiter delim. -If count is positive, everything to the left of the final delimiter (counting from the left) is returned. 
-If count is negative, everything to the right of the final delimiter (counting from the right) is returned."#, - "substr_index(str, delim, count)") - .with_sql_example(r#"```sql -> select substr_index('www.apache.org', '.', 1); -+---------------------------------------------------------+ -| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) | -+---------------------------------------------------------+ -| www | -+---------------------------------------------------------+ -> select substr_index('www.apache.org', '.', -1); -+----------------------------------------------------------+ -| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) | -+----------------------------------------------------------+ -| org | -+----------------------------------------------------------+ -```"#) - .with_standard_argument("str", Some("String")) - .with_argument("delim", "The string to find in str to split str.") - .with_argument("count", "The number of times to search for the delimiter. Can be either a positive or negative number.") - .build() - }) -} - /// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. /// SUBSTRING_INDEX('www.apache.org', '.', 1) = www /// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 9257b0b04e61..47766ded3add 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -16,7 +16,7 @@ // under the License. 
use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, @@ -27,12 +27,31 @@ use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "String Functions"), + description = "Translates characters in a string to specified translation characters.", + syntax_example = "translate(str, chars, translation)", + sql_example = r#"```sql +> select translate('twice', 'wic', 'her'); ++--------------------------------------------------+ +| translate(Utf8("twice"),Utf8("wic"),Utf8("her")) | ++--------------------------------------------------+ +| there | ++--------------------------------------------------+ +```"#, + standard_argument(name = "str", prefix = "String"), + argument(name = "chars", description = "Characters to translate."), + argument( + name = "translation", + description = "Translation characters. Translation characters replace only characters at the same position in the **chars** string." 
+ ) +)] #[derive(Debug)] pub struct TranslateFunc { signature: Signature, @@ -85,30 +104,10 @@ impl ScalarUDFImpl for TranslateFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_translate_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_translate_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder(DOC_SECTION_STRING,"Translates characters in a string to specified translation characters.","translate(str, chars, translation)") - .with_sql_example(r#"```sql -> select translate('twice', 'wic', 'her'); -+--------------------------------------------------+ -| translate(Utf8("twice"),Utf8("wic"),Utf8("her")) | -+--------------------------------------------------+ -| there | -+--------------------------------------------------+ -```"#) - .with_standard_argument("str", Some("String")) - .with_argument("chars", "Characters to translate.") - .with_argument("translation", "Translation characters. Translation characters replace only characters at the same position in the **chars** string.") - .build() - }) -} - fn invoke_translate(args: &[ArrayRef]) -> Result { match args[0].data_type() { DataType::Utf8View => { diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index f17e7189a948..c4501fff8f78 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1369,7 +1369,7 @@ right(str, n) #### Arguments - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. -- **n**: Number of characters to return +- **n**: Number of characters to return. 
#### Example From 4d07579e91839a66b44c058636a34bdb14d1f041 Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Mon, 30 Dec 2024 18:26:40 +0800 Subject: [PATCH 4/4] Support explain query when running dfbench with clickbench (#13942) * Support explain query when running dfbench * Address comments --- benchmarks/src/clickbench.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index 46dd4b18825b..6b7c75ed4bab 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -145,6 +145,9 @@ impl RunOpt { ); benchmark_run.write_iter(elapsed, row_count); } + if self.common.debug { + ctx.sql(sql).await?.explain(false, false)?.show().await?; + } } benchmark_run.maybe_write_json(self.output_path.as_ref())?; Ok(())