diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index f3894ad1e8ad..f8a90230b38f 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -633,7 +633,8 @@ impl SessionState { ParserOptions { parse_float_as_decimal: sql_parser_options.parse_float_as_decimal, enable_ident_normalization: sql_parser_options.enable_ident_normalization, - enable_options_value_normalization: sql_parser_options.enable_options_value_normalization, + enable_options_value_normalization: sql_parser_options + .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 5eafb5409b3e..1d7225a08a43 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -25,10 +25,10 @@ use datafusion_common::{ field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError, }; use datafusion_expr::planner::UserDefinedSQLPlanner; -use sqlparser::ast::{TimezoneInfo, Value}; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; +use sqlparser::ast::{TimezoneInfo, Value}; use datafusion_common::TableReference; use datafusion_common::{ @@ -296,8 +296,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let default_expr = self .sql_to_expr(default_sql_expr.clone(), &empty_schema, planner_context) .map_err(error_desc)?; - column_defaults - .push((self.ident_normalizer.normalize(column.name.clone()), default_expr)); + column_defaults.push(( + self.ident_normalizer.normalize(column.name.clone()), + default_expr, + )); } } Ok(column_defaults) @@ -312,7 +314,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let plan = self.apply_expr_alias(plan, alias.columns)?; LogicalPlanBuilder::from(plan) - .alias(TableReference::bare(self.ident_normalizer.normalize(alias.name)))? + .alias(TableReference::bare( + self.ident_normalizer.normalize(alias.name), + ))? .build() } diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 20203eaa12d5..ba00ac240ce9 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -856,7 +856,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - let options_map: HashMap = self.parse_options_map(statement.options)?; + let options_map: HashMap = + self.parse_options_map(statement.options, true)?; let maybe_file_type = if let Some(stored_as) = &statement.stored_as { if let Ok(ext_file_type) = self.context_provider.get_file_type(stored_as) { @@ -964,7 +965,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let inline_constraints = calc_inline_constraints_from_columns(&columns); all_constraints.extend(inline_constraints); - let options_map = self.parse_options_map(options)?; + let options_map = self.parse_options_map(options, false)?; let compression = options_map .get("format.compression") @@ -1018,10 +1019,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fn parse_options_map( &self, - options: Vec<(String, Value)>) -> Result> { + options: Vec<(String, Value)>, + allow_duplicates: bool, + ) -> Result> { let mut options_map = HashMap::new(); for (key, value) in options { - if options_map.contains_key(&key) { + if !allow_duplicates && options_map.contains_key(&key) { return plan_err!("Option {key} is specified multiple times"); } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 532e0a9cfa18..242c0e91c0cb 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -27,9 +27,11 @@ use datafusion_common::{ assert_contains, DataFusionError, ParamValues, Result, ScalarValue, }; use datafusion_expr::{ + dml::CopyTo, logical_plan::{LogicalPlan, Prepare}, test::function_stub::sum_udaf, - ColumnarValue, ScalarUDF, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, CreateExternalTable, DdlStatement, ScalarUDF, ScalarUDFImpl, + Signature, Volatility, }; use datafusion_functions::{string, unicode}; use datafusion_sql::{ @@ -151,6 +153,68 @@ fn parse_ident_normalization() { } } +#[test] +fn test_parse_options_value_normalization() { + let test_data = [ + ( + "CREATE EXTERNAL TABLE test OPTIONS ('location' 'LoCaTiOn') STORED AS PARQUET LOCATION 'fake_location'", + "CreateExternalTable: Bare { table: \"test\" }", + HashMap::from([("format.location", "LoCaTiOn")]), + false, + ), + ( + "CREATE EXTERNAL TABLE test OPTIONS ('location' 'LoCaTiOn') STORED AS PARQUET LOCATION 'fake_location'", + "CreateExternalTable: Bare { table: \"test\" }", + HashMap::from([("format.location", "location")]), + true, + ), + ( + "COPY test TO 'fake_location' STORED AS PARQUET OPTIONS ('location' 'LoCaTiOn')", + "CopyTo: format=csv output_url=fake_location options: (format.location LoCaTiOn)\n TableScan: test", + HashMap::from([("format.location", "LoCaTiOn")]), + false, + ), + ( + "COPY test TO 'fake_location' STORED AS PARQUET OPTIONS ('location' 'LoCaTiOn')", + "CopyTo: format=csv output_url=fake_location options: (format.location location)\n TableScan: test", + HashMap::from([("format.location", "location")]), + true, + ), + ]; + + for (sql, expected_plan, expected_options, enable_options_value_normalization) in + test_data + { + let plan = logical_plan_with_options( + sql, + ParserOptions { + parse_float_as_decimal: false, + enable_ident_normalization: false, + support_varchar_with_length: false, + enable_options_value_normalization, + }, + ); + if plan.is_ok() { + let plan = plan.unwrap(); + assert_eq!(expected_plan, format!("{plan:?}")); + + match plan { + LogicalPlan::Ddl(DdlStatement::CreateExternalTable( + CreateExternalTable { options, .. }, + )) + | LogicalPlan::Copy(CopyTo { options, .. }) => { + expected_options.iter().for_each(|(k, v)| { + assert_eq!(Some(&v.to_string()), options.get(*k)); + }); + } + _ => panic!("Expected Ddl(CreateExternalTable) but got {:?}", plan), + } + } else { + assert_eq!(expected_plan, plan.unwrap_err().strip_backtrace()); + } + } +} + #[test] fn select_no_relation() { quick_test( diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index acd465a0c021..986002a10795 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -238,6 +238,7 @@ datafusion.optimizer.skip_failed_rules false datafusion.optimizer.top_down_join_key_reordering true datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true +datafusion.sql_parser.enable_options_value_normalization true datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.support_varchar_with_length true @@ -322,6 +323,7 @@ datafusion.optimizer.skip_failed_rules false When set to true, the logical plan datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) +datafusion.sql_parser.enable_options_value_normalization true When set to true, SQL parser will normalize options value (convert value to lowercase) datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 303caef57700..59f6cc1b1561 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -113,6 +113,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | | datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | | datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | -| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | +| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) +| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. |