diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 1d2a9589adfc..4c2bfddf1639 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -209,6 +209,9 @@ config_namespace! { /// specified. The Arrow type system does not have a notion of maximum /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true + + /// When set to true, SQL parser will normalize options value (convert value to lowercase) + pub enable_options_value_normalization: bool, default = true } } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index d056b91c2747..f3894ad1e8ad 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -633,6 +633,7 @@ impl SessionState { ParserOptions { parse_float_as_decimal: sql_parser_options.parse_float_as_decimal, enable_ident_normalization: sql_parser_options.enable_ident_normalization, + enable_options_value_normalization: sql_parser_options.enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, } } diff --git a/datafusion/sql/src/cte.rs b/datafusion/sql/src/cte.rs index 0035dcda6ed7..5ca10aa97a16 100644 --- a/datafusion/sql/src/cte.rs +++ b/datafusion/sql/src/cte.rs @@ -38,7 +38,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Process CTEs from top to bottom for cte in with.cte_tables { // A `WITH` block can't use the same name more than once - let cte_name = self.normalizer.normalize(cte.alias.name.clone()); + let cte_name = self.ident_normalizer.normalize(cte.alias.name.clone()); if planner_context.contains_cte(&cte_name) { return plan_err!( "WITH query name {cte_name:?} specified more than once" diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index d297b2e4df5b..a278e9640234 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -46,7 +46,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // interpret names with '.' as if they were // compound identifiers, but this is not a compound // identifier. (e.g. it is "foo.bar" not foo.bar) - let normalize_ident = self.normalizer.normalize(id); + let normalize_ident = self.ident_normalizer.normalize(id); match schema.field_with_unqualified_name(normalize_ident.as_str()) { Ok(_) => { // found a match without a qualified name, this is a inner table column @@ -97,7 +97,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if ids[0].value.starts_with('@') { let var_names: Vec<_> = ids .into_iter() - .map(|id| self.normalizer.normalize(id)) + .map(|id| self.ident_normalizer.normalize(id)) .collect(); let ty = self .context_provider @@ -111,7 +111,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } else { let ids = ids .into_iter() - .map(|id| self.normalizer.normalize(id)) + .map(|id| self.ident_normalizer.normalize(id)) .collect::>(); // Currently not supporting more than one nested level diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 443cd64a940c..5eafb5409b3e 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -25,7 +25,7 @@ use datafusion_common::{ field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError, }; use datafusion_expr::planner::UserDefinedSQLPlanner; -use sqlparser::ast::TimezoneInfo; +use sqlparser::ast::{TimezoneInfo, Value}; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; @@ -49,6 +49,7 @@ pub struct ParserOptions { pub parse_float_as_decimal: bool, pub enable_ident_normalization: bool, pub support_varchar_with_length: bool, + pub enable_options_value_normalization: bool, } impl Default for ParserOptions { @@ -57,6 +58,7 @@ impl Default for ParserOptions { parse_float_as_decimal: false, enable_ident_normalization: true, support_varchar_with_length: true, + enable_options_value_normalization: true, } } } @@ -87,6 +89,32 @@ impl IdentNormalizer { } } +/// Value Normalizer +#[derive(Debug)] +pub struct ValueNormalizer { + normalize: bool, +} + +impl Default for ValueNormalizer { + fn default() -> Self { + Self { normalize: true } + } +} + +impl ValueNormalizer { + pub fn new(normalize: bool) -> Self { + Self { normalize } + } + + pub fn normalize(&self, value: Value) -> Result { + if self.normalize { + crate::utils::normalize_value(&value) + } else { + crate::utils::value_to_string(&value) + } + } +} + /// Struct to store the states used by the Planner. The Planner will leverage the states to resolve /// CTEs, Views, subqueries and PREPARE statements. The states include /// Common Table Expression (CTE) provided with WITH clause and @@ -185,7 +213,8 @@ impl PlannerContext { pub struct SqlToRel<'a, S: ContextProvider> { pub(crate) context_provider: &'a S, pub(crate) options: ParserOptions, - pub(crate) normalizer: IdentNormalizer, + pub(crate) ident_normalizer: IdentNormalizer, + pub(crate) value_normalizer: ValueNormalizer, /// user defined planner extensions pub(crate) planners: Vec>, } @@ -207,12 +236,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Create a new query planner pub fn new_with_options(context_provider: &'a S, options: ParserOptions) -> Self { - let normalize = options.enable_ident_normalization; + let ident_normalize = options.enable_ident_normalization; + let options_value_normalize = options.enable_options_value_normalization; SqlToRel { context_provider, options, - normalizer: IdentNormalizer::new(normalize), + ident_normalizer: IdentNormalizer::new(ident_normalize), + value_normalizer: ValueNormalizer::new(options_value_normalize), planners: vec![], } } @@ -227,7 +258,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .iter() .any(|x| x.option == ColumnOption::NotNull); fields.push(Field::new( - self.normalizer.normalize(column.name), + self.ident_normalizer.normalize(column.name), data_type, !not_nullable, )); @@ -266,7 +297,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .sql_to_expr(default_sql_expr.clone(), &empty_schema, planner_context) .map_err(error_desc)?; column_defaults - .push((self.normalizer.normalize(column.name.clone()), default_expr)); + .push((self.ident_normalizer.normalize(column.name.clone()), default_expr)); } } Ok(column_defaults) @@ -281,7 +312,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let plan = self.apply_expr_alias(plan, alias.columns)?; LogicalPlanBuilder::from(plan) - .alias(TableReference::bare(self.normalizer.normalize(alias.name)))? + .alias(TableReference::bare(self.ident_normalizer.normalize(alias.name)))? .build() } @@ -302,7 +333,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let fields = plan.schema().fields().clone(); LogicalPlanBuilder::from(plan) .project(fields.iter().zip(idents.into_iter()).map(|(field, ident)| { - col(field.name()).alias(self.normalizer.normalize(ident)) + col(field.name()).alias(self.ident_normalizer.normalize(ident)) }))? .build() } @@ -428,7 +459,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { None => Ident::new(format!("c{idx}")) }; Ok(Arc::new(Field::new( - self.normalizer.normalize(field_name), + self.ident_normalizer.normalize(field_name), data_type, true, ))) diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index ee2e35b550f6..fb1d00b7e48a 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -115,7 +115,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { JoinConstraint::Using(idents) => { let keys: Vec = idents .into_iter() - .map(|x| Column::from_name(self.normalizer.normalize(x))) + .map(|x| Column::from_name(self.ident_normalizer.normalize(x))) .collect(); LogicalPlanBuilder::from(left) .join_using(right, join_type, keys)? diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 236403e83d74..21d39f449ac7 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -460,7 +460,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { &[&[plan.schema()]], &plan.using_columns()?, )?; - let name = self.normalizer.normalize(alias); + let name = self.ident_normalizer.normalize(alias); // avoiding adding an alias if the column name is the same. let expr = match &col { Expr::Column(column) if column.name.eq(&name) => col, diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 6cdb2f959cd8..20203eaa12d5 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -65,30 +65,6 @@ fn ident_to_string(ident: &Ident) -> String { normalize_ident(ident.to_owned()) } -fn value_to_string(value: &Value) -> Option { - match value { - Value::SingleQuotedString(s) => Some(s.to_string()), - Value::DollarQuotedString(s) => Some(s.to_string()), - Value::Number(_, _) | Value::Boolean(_) => Some(value.to_string()), - Value::DoubleQuotedString(_) - | Value::EscapedStringLiteral(_) - | Value::NationalStringLiteral(_) - | Value::SingleQuotedByteStringLiteral(_) - | Value::DoubleQuotedByteStringLiteral(_) - | Value::TripleSingleQuotedString(_) - | Value::TripleDoubleQuotedString(_) - | Value::TripleSingleQuotedByteStringLiteral(_) - | Value::TripleDoubleQuotedByteStringLiteral(_) - | Value::SingleQuotedRawStringLiteral(_) - | Value::DoubleQuotedRawStringLiteral(_) - | Value::TripleSingleQuotedRawStringLiteral(_) - | Value::TripleDoubleQuotedRawStringLiteral(_) - | Value::HexStringLiteral(_) - | Value::Null - | Value::Placeholder(_) => None, - } -} - fn object_name_to_string(object_name: &ObjectName) -> String { object_name .0 @@ -880,25 +856,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - let mut options = HashMap::new(); - for (key, value) in statement.options { - let value_string = match value_to_string(&value) { - None => { - return plan_err!("Unsupported Value in COPY statement {}", value); - } - Some(v) => v, - }; - - if !(&key.contains('.')) { - // If config does not belong to any namespace, assume it is - // a format option and apply the format prefix for backwards - // compatibility. - let renamed_key = format!("format.{}", key); - options.insert(renamed_key.to_lowercase(), value_string.to_lowercase()); - } else { - options.insert(key.to_lowercase(), value_string.to_lowercase()); - } - } + let options_map: HashMap = self.parse_options_map(statement.options)?; let maybe_file_type = if let Some(stored_as) = &statement.stored_as { if let Ok(ext_file_type) = self.context_provider.get_file_type(stored_as) { @@ -945,7 +903,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { output_url: statement.target, file_type, partition_by, - options, + options: options_map, })) } @@ -1006,29 +964,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let inline_constraints = calc_inline_constraints_from_columns(&columns); all_constraints.extend(inline_constraints); - let mut options_map = HashMap::::new(); - for (key, value) in options { - if options_map.contains_key(&key) { - return plan_err!("Option {key} is specified multiple times"); - } - - let Some(value_string) = value_to_string(&value) else { - return plan_err!( - "Unsupported Value in CREATE EXTERNAL TABLE statement {}", - value - ); - }; - - if !(&key.contains('.')) { - // If a config does not belong to any namespace, we assume it is - // a format option and apply the format prefix for backwards - // compatibility. - let renamed_key = format!("format.{}", key.to_lowercase()); - options_map.insert(renamed_key, value_string.to_lowercase()); - } else { - options_map.insert(key.to_lowercase(), value_string.to_lowercase()); - } - } + let options_map = self.parse_options_map(options)?; let compression = options_map .get("format.compression") @@ -1080,6 +1016,31 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))) } + fn parse_options_map( + &self, + options: Vec<(String, Value)>) -> Result> { + let mut options_map = HashMap::new(); + for (key, value) in options { + if options_map.contains_key(&key) { + return plan_err!("Option {key} is specified multiple times"); + } + + let value_string = self.value_normalizer.normalize(value)?; + + if !(&key.contains('.')) { + // If config does not belong to any namespace, assume it is + // a format option and apply the format prefix for backwards + // compatibility. + let renamed_key = format!("format.{}", key); + options_map.insert(renamed_key.to_lowercase(), value_string); + } else { + options_map.insert(key.to_lowercase(), value_string); + } + } + + Ok(options_map) + } + /// Generate a plan for EXPLAIN ... that will print out a plan /// /// Note this is the sqlparser explain statement, not the @@ -1187,11 +1148,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // parse value string from Expr let value_string = match &value[0] { SQLExpr::Identifier(i) => ident_to_string(i), - SQLExpr::Value(v) => match value_to_string(v) { - None => { + SQLExpr::Value(v) => match crate::utils::value_to_string(v) { + Err(_) => { return plan_err!("Unsupported Value {}", value[0]); } - Some(v) => v, + Ok(v) => v, }, // for capture signed number e.g. +8, -8 SQLExpr::UnaryOp { op, expr } => match op { @@ -1346,7 +1307,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // If the target table has an alias, use it to qualify the column name if let Some(alias) = &table_alias { datafusion_expr::Expr::Column(Column::new( - Some(self.normalizer.normalize(alias.name.clone())), + Some(self.ident_normalizer.normalize(alias.name.clone())), field.name(), )) } else { @@ -1403,7 +1364,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut value_indices = vec![None; table_schema.fields().len()]; let fields = columns .into_iter() - .map(|c| self.normalizer.normalize(c)) + .map(|c| self.ident_normalizer.normalize(c)) .enumerate() .map(|(i, c)| { let column_index = table_schema diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 2eacbd174fc2..33877838a67b 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -32,7 +32,7 @@ use datafusion_expr::builder::get_unnested_columns; use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs}; use datafusion_expr::{expr_vec_fmt, Expr, ExprSchemable, LogicalPlan}; -use sqlparser::ast::Ident; +use sqlparser::ast::{Ident, Value}; /// Make a best-effort attempt at resolving all columns in the expression tree pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { @@ -263,6 +263,34 @@ pub(crate) fn normalize_ident(id: Ident) -> String { } } +pub(crate) fn normalize_value(value: &Value) -> Result { + value_to_string(value).and_then(|v| Ok(v.to_ascii_lowercase())) +} + +pub(crate) fn value_to_string(value: &Value) -> Result { + match value { + Value::SingleQuotedString(s) => Ok(s.to_string()), + Value::DollarQuotedString(s) => Ok(s.to_string()), + Value::Number(_, _) | Value::Boolean(_) => Ok(value.to_string()), + Value::DoubleQuotedString(_) + | Value::EscapedStringLiteral(_) + | Value::NationalStringLiteral(_) + | Value::SingleQuotedByteStringLiteral(_) + | Value::DoubleQuotedByteStringLiteral(_) + | Value::TripleSingleQuotedString(_) + | Value::TripleDoubleQuotedString(_) + | Value::TripleSingleQuotedByteStringLiteral(_) + | Value::TripleDoubleQuotedByteStringLiteral(_) + | Value::SingleQuotedRawStringLiteral(_) + | Value::DoubleQuotedRawStringLiteral(_) + | Value::TripleSingleQuotedRawStringLiteral(_) + | Value::TripleDoubleQuotedRawStringLiteral(_) + | Value::HexStringLiteral(_) + | Value::Null + | Value::Placeholder(_) => plan_err!("Unsupported Value to normalize {}", value), + } +} + /// The context is we want to rewrite unnest() into InnerProjection->Unnest->OuterProjection /// Given an expression which contains unnest expr as one of its children, /// Try transform depends on unnest type diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index aca0d040bb8d..532e0a9cfa18 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -85,6 +85,7 @@ fn parse_decimals() { parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, + enable_options_value_normalization: false, }, ); } @@ -139,6 +140,7 @@ fn parse_ident_normalization() { parse_float_as_decimal: false, enable_ident_normalization, support_varchar_with_length: false, + enable_options_value_normalization: false, }, ); if plan.is_ok() {