Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update INITCAP scalar function to support Utf8View #11888

Merged
merged 4 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 62 additions & 19 deletions datafusion/functions/src/string/initcap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::sync::Arc;
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
use arrow::datatypes::DataType;

use datafusion_common::cast::as_generic_string_array;
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::{exec_err, Result};
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
Expand All @@ -45,7 +45,7 @@ impl InitcapFunc {
Self {
signature: Signature::uniform(
1,
vec![Utf8, LargeUtf8],
vec![Utf8, LargeUtf8, Utf8View],
Volatility::Immutable,
),
}
Expand Down Expand Up @@ -73,6 +73,9 @@ impl ScalarUDFImpl for InitcapFunc {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(initcap::<i32>, vec![])(args),
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
DataType::Utf8View => {
make_scalar_function(initcap_utf8view::<i32>, vec![])(args)
}
other => {
exec_err!("Unsupported data type {other:?} for function initcap")
}
Expand All @@ -88,28 +91,40 @@ fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
// first map is the iterator, second is for the `Option<_>`
let result = string_array
.iter()
.map(|string| {
string.map(|string: &str| {
let mut char_vector = Vec::<char>::new();
let mut previous_character_letter_or_number = false;
for c in string.chars() {
if previous_character_letter_or_number {
char_vector.push(c.to_ascii_lowercase());
} else {
char_vector.push(c.to_ascii_uppercase());
}
previous_character_letter_or_number = c.is_ascii_uppercase()
|| c.is_ascii_lowercase()
|| c.is_ascii_digit();
}
char_vector.iter().collect::<String>()
})
})
.map(initcap_string)
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}

fn initcap_utf8view<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am considering it might be a bit heavy to make this a macro, we wouldn't have a lot of initcap_* like this.

xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved
let string_view_array = as_string_view_array(&args[0])?;

let result = string_view_array
.iter()
.map(initcap_string)
.collect::<GenericStringArray<T>>();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type is a StringArray instead of a StringViewArray, should I alter this behavior? In previous it was defined here

https://github.com/apache/datafusion/blob/2521043ddcb3895a2010b8e328f3fa10f77fc094/datafusion/functions/src/utils.rs#L45C1-L46C1

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current utf8_to_str_type only return Utf8 or LargeUtf8. I think ideally we should support returning Utf8View. But since we are recreating the strings anyway, I'm not sure if StringView will help here.


Ok(Arc::new(result) as ArrayRef)
}

fn initcap_string(string: Option<&str>) -> Option<String> {
string.map(|string: &str| {
let mut char_vector = Vec::<char>::new();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect you could make this faster by creating the vector once and then resetting on each loop -- like

        let mut char_vector = Vec::<char>::new();
    string.map(|string: &str| {
      char_vector.clear();
...
}

let mut previous_character_letter_or_number = false;
for c in string.chars() {
if previous_character_letter_or_number {
char_vector.push(c.to_ascii_lowercase());
} else {
char_vector.push(c.to_ascii_uppercase());
}
previous_character_letter_or_number =
c.is_ascii_uppercase() || c.is_ascii_lowercase() || c.is_ascii_digit();
}
char_vector.iter().collect::<String>()
})
}

#[cfg(test)]
mod tests {
use crate::string::initcap::InitcapFunc;
Expand Down Expand Up @@ -153,6 +168,34 @@ mod tests {
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"hi THOMAS".to_string()
)))],
Ok(Some("Hi Thomas")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"".to_string()
)))],
Ok(Some("")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
Ok(None),
&str,
Utf8,
StringArray
);
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved

Ok(())
}
Expand Down
35 changes: 35 additions & 0 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,41 @@ logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
02)--TableScan: test projection=[column1_utf8view]

### Initcap
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved

# Create a table with lowercase strings
statement ok
CREATE TABLE test_lowercase AS SELECT
lower(column1_utf8) as column1_utf8_lower,
lower(column1_large_utf8) as column1_large_utf8_lower,
lower(column1_utf8view) as column1_utf8view_lower
FROM test;

# Test INITCAP with utf8view, utf8, and largeutf8
# Should not cast anything
query TT
EXPLAIN SELECT
INITCAP(column1_utf8view_lower) as c1,
INITCAP(column1_utf8_lower) as c2,
INITCAP(column1_large_utf8_lower) as c3
FROM test_lowercase;
----
logical_plan
01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3
02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower]

query TTT
SELECT
INITCAP(column1_utf8view_lower) as c1,
INITCAP(column1_utf8_lower) as c2,
INITCAP(column1_large_utf8_lower) as c3
FROM test_lowercase;
----
Andrew Andrew Andrew
Xiangpeng Xiangpeng Xiangpeng
Raphael Raphael Raphael
NULL NULL NULL

statement ok
drop table test;

Expand Down