From 1c5286be689bf53faf669d61e9e62fe6937a69bf Mon Sep 17 00:00:00 2001 From: baishen Date: Tue, 16 Apr 2024 15:30:54 +0800 Subject: [PATCH] feat(query): match function support multiple fields with boost (#15196) * feat(query): match function support multiple fields with boost * fix * fix test --- src/query/catalog/src/plan/pushdown.rs | 13 +- .../tests/it/inverted_index/index_refresh.rs | 4 +- .../ee/tests/it/inverted_index/pruning.rs | 50 ++- src/query/functions/src/lib.rs | 2 +- .../sql/src/planner/semantic/type_check.rs | 328 ++++++++++++++---- .../inverted_index/inverted_index_reader.rs | 27 +- .../fuse/src/pruning/inverted_index_pruner.rs | 2 +- .../04_0000_inverted_index_base.test | 169 ++++++++- 8 files changed, 506 insertions(+), 89 deletions(-) diff --git a/src/query/catalog/src/plan/pushdown.rs b/src/query/catalog/src/plan/pushdown.rs index 87ed5afac618..fb4d9ae08b75 100644 --- a/src/query/catalog/src/plan/pushdown.rs +++ b/src/query/catalog/src/plan/pushdown.rs @@ -15,6 +15,7 @@ use std::fmt::Debug; use databend_common_expression::types::DataType; +use databend_common_expression::types::F32; use databend_common_expression::DataSchema; use databend_common_expression::RemoteExpr; use databend_common_expression::Scalar; @@ -69,12 +70,22 @@ pub struct PrewhereInfo { pub virtual_columns: Option>, } +/// Information about inverted index. #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] pub struct InvertedIndexInfo { + /// The index name. pub index_name: String, + /// The index version. pub index_version: String, + /// The index schema. pub index_schema: DataSchema, - pub query_columns: Vec, + /// The query field names and optional boost value, + /// if boost is set, the score for the field is multiplied by the boost value. + /// For example, if set `title^5.0, description^2.0`, + /// it means that the score for `title` field is multiplied by 5.0, + /// and the score for `description` field is multiplied by 2.0. 
+ pub query_fields: Vec<(String, Option)>, + /// The search query text with query syntax. pub query_text: String, } diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index a63b1de79e43..1fde5c8f7511 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -126,7 +126,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { let dal = new_fuse_table.get_operator_ref(); let schema = DataSchema::from(table_schema); - let query_columns = vec!["title".to_string(), "content".to_string()]; + let query_fields = vec![("title".to_string(), None), ("content".to_string(), None)]; let index_loc = TableMetaLocationGenerator::gen_inverted_index_location_from_block_location( &block_meta.location.0, @@ -135,7 +135,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { ); let index_reader = - InvertedIndexReader::try_create(dal.clone(), &schema, &query_columns, &index_loc).await?; + InvertedIndexReader::try_create(dal.clone(), &schema, &query_fields, &index_loc).await?; let query = "rust"; let matched_rows = index_reader.do_filter(query, block_meta.row_count)?; diff --git a/src/query/ee/tests/it/inverted_index/pruning.rs b/src/query/ee/tests/it/inverted_index/pruning.rs index bdd7d4e3057d..73fb26d9ce40 100644 --- a/src/query/ee/tests/it/inverted_index/pruning.rs +++ b/src/query/ee/tests/it/inverted_index/pruning.rs @@ -24,6 +24,7 @@ use databend_common_exception::Result; use databend_common_expression::types::number::UInt64Type; use databend_common_expression::types::NumberDataType; use databend_common_expression::types::StringType; +use databend_common_expression::types::F32; use databend_common_expression::DataBlock; use databend_common_expression::DataSchema; use databend_common_expression::FromData; @@ -446,7 +447,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: 
index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["idiom".to_string()], + query_fields: vec![("idiom".to_string(), None)], query_text: "test".to_string(), }), ..Default::default() @@ -456,7 +457,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["idiom".to_string()], + query_fields: vec![("idiom".to_string(), None)], query_text: "save".to_string(), }), ..Default::default() @@ -466,7 +467,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["idiom".to_string()], + query_fields: vec![("idiom".to_string(), None)], query_text: "one".to_string(), }), ..Default::default() @@ -476,7 +477,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["idiom".to_string()], + query_fields: vec![("idiom".to_string(), None)], query_text: "the".to_string(), }), ..Default::default() @@ -486,7 +487,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["idiom".to_string()], + query_fields: vec![("idiom".to_string(), None)], query_text: "光阴".to_string(), }), ..Default::default() @@ -496,7 +497,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["idiom".to_string()], + query_fields: vec![("idiom".to_string(), None)], query_text: "人生".to_string(), }), ..Default::default() @@ -506,7 +507,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - 
query_columns: vec!["meaning".to_string()], + query_fields: vec![("meaning".to_string(), None)], query_text: "people".to_string(), }), ..Default::default() @@ -516,7 +517,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["meaning".to_string()], + query_fields: vec![("meaning".to_string(), None)], query_text: "bad".to_string(), }), ..Default::default() @@ -526,7 +527,7 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["meaning".to_string()], + query_fields: vec![("meaning".to_string(), None)], query_text: "黄金".to_string(), }), ..Default::default() @@ -536,11 +537,37 @@ async fn test_block_pruner() -> Result<()> { index_name: index_name.clone(), index_version: index_version.clone(), index_schema: index_schema.clone(), - query_columns: vec!["meaning".to_string()], + query_fields: vec![("meaning".to_string(), None)], query_text: "时间".to_string(), }), ..Default::default() }; + let e11 = PushDownInfo { + inverted_index: Some(InvertedIndexInfo { + index_name: index_name.clone(), + index_version: index_version.clone(), + index_schema: index_schema.clone(), + query_fields: vec![ + ("idiom".to_string(), Some(F32::from(5.0))), + ("meaning".to_string(), Some(F32::from(1.0))), + ], + query_text: "you".to_string(), + }), + ..Default::default() + }; + let e12 = PushDownInfo { + inverted_index: Some(InvertedIndexInfo { + index_name: index_name.clone(), + index_version: index_version.clone(), + index_schema: index_schema.clone(), + query_fields: vec![ + ("idiom".to_string(), Some(F32::from(5.0))), + ("meaning".to_string(), Some(F32::from(1.0))), + ], + query_text: "光阴".to_string(), + }), + ..Default::default() + }; let extras = vec![ (Some(e1), 0, 0), (Some(e2), 2, 2), @@ -552,7 +579,10 @@ async fn test_block_pruner() -> Result<()> 
{ (Some(e8), 4, 4), (Some(e9), 1, 2), (Some(e10), 2, 2), + (Some(e11), 9, 15), + (Some(e12), 2, 2), ]; + for (extra, expected_blocks, expected_rows) in extras { let block_metas = apply_block_pruning( snapshot.clone(), diff --git a/src/query/functions/src/lib.rs b/src/query/functions/src/lib.rs index 61e0a7635c52..bb518ac5a632 100644 --- a/src/query/functions/src/lib.rs +++ b/src/query/functions/src/lib.rs @@ -68,7 +68,7 @@ pub const GENERAL_LAMBDA_FUNCTIONS: [&str; 5] = [ "array_reduce", ]; -pub const GENERAL_SEARCH_FUNCTIONS: [&str; 2] = ["match", "score"]; +pub const GENERAL_SEARCH_FUNCTIONS: [&str; 3] = ["match", "query", "score"]; fn builtin_functions() -> FunctionRegistry { let mut registry = FunctionRegistry::empty(); diff --git a/src/query/sql/src/planner/semantic/type_check.rs b/src/query/sql/src/planner/semantic/type_check.rs index b422b729920a..ebedfea28c34 100644 --- a/src/query/sql/src/planner/semantic/type_check.rs +++ b/src/query/sql/src/planner/semantic/type_check.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; use std::collections::VecDeque; +use std::str::FromStr; use std::sync::Arc; use std::vec; @@ -62,6 +63,7 @@ use databend_common_expression::types::decimal::MAX_DECIMAL256_PRECISION; use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberScalar; +use databend_common_expression::types::F32; use databend_common_expression::ColumnIndex; use databend_common_expression::ConstantFolder; use databend_common_expression::DataField; @@ -895,8 +897,21 @@ impl<'a> TypeChecker<'a> { self.resolve_lambda_function(*span, func_name, &args, lambda) .await? } else if GENERAL_SEARCH_FUNCTIONS.contains(&func_name) { - self.resolve_search_function(*span, func_name, &args) - .await? + match func_name { + "score" => { + self.resolve_score_search_function(*span, func_name, &args) + .await? 
+ } + "match" => { + self.resolve_match_search_function(*span, func_name, &args) + .await? + } + "query" => { + self.resolve_query_search_function(*span, func_name, &args) + .await? + } + _ => unreachable!(), + } } else { // Scalar function let mut new_params: Vec = Vec::with_capacity(params.len()); @@ -1994,41 +2009,51 @@ impl<'a> TypeChecker<'a> { } #[async_backtrace::framed] - async fn resolve_search_function( + async fn resolve_score_search_function( &mut self, span: Span, func_name: &str, args: &[&Expr], ) -> Result> { - if func_name == "score" { - if !args.is_empty() { - return Err(ErrorCode::SemanticError(format!( - "invalid arguments for search function, {} expects 0 argument, but got {}", - func_name, - args.len() - )) - .set_span(span)); - } - let internal_column = - InternalColumn::new(SEARCH_SCORE_COL_NAME, InternalColumnType::SearchScore); - - let internal_column_binding = InternalColumnBinding { - database_name: None, - table_name: None, - internal_column, - }; - let column = self.bind_context.add_internal_column_binding( - &internal_column_binding, - self.metadata.clone(), - false, - )?; - - let scalar_expr = ScalarExpr::BoundColumnRef(BoundColumnRef { span, column }); - let data_type = DataType::Number(NumberDataType::Float32); - return Ok(Box::new((scalar_expr, data_type))); + if !args.is_empty() { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, {} expects 0 argument, but got {}", + func_name, + args.len() + )) + .set_span(span)); } + let internal_column = + InternalColumn::new(SEARCH_SCORE_COL_NAME, InternalColumnType::SearchScore); - // match function + let internal_column_binding = InternalColumnBinding { + database_name: None, + table_name: None, + internal_column, + }; + let column = self.bind_context.add_internal_column_binding( + &internal_column_binding, + self.metadata.clone(), + false, + )?; + + let scalar_expr = ScalarExpr::BoundColumnRef(BoundColumnRef { span, column }); + let data_type = 
DataType::Number(NumberDataType::Float32); + Ok(Box::new((scalar_expr, data_type))) + } + + /// Resolve match search function. + /// The first argument is the field or fields to match against, + /// multiple fields can have an optional per-field boosting that + /// gives preferential weight to fields being searched in. + /// For example: title^5, content^1.2 + /// The second argument is the query text without query syntax. + async fn resolve_match_search_function( + &mut self, + span: Span, + func_name: &str, + args: &[&Expr], + ) -> Result> { if !matches!(self.bind_context.expr_context, ExprContext::WhereClause) { return Err(ErrorCode::SemanticError(format!( "search function {} can only be used in where clause", @@ -2050,54 +2075,239 @@ impl<'a> TypeChecker<'a> { let field_arg = args[0]; let query_arg = args[1]; - // TODO: support multiple fields let box (field_scalar, _) = self.resolve(field_arg).await?; - let Ok(column_ref) = BoundColumnRef::try_from(field_scalar) else { - return Err(ErrorCode::SemanticError( - "invalid arguments for search function, field must be a column".to_string(), - ) - .set_span(span)); + let column_refs = match field_scalar { + // single field without boost + ScalarExpr::BoundColumnRef(column_ref) => { + vec![(column_ref, None)] + } + // constant multiple fields with boosts + ScalarExpr::ConstantExpr(constant_expr) => { + let Some(constant_field) = constant_expr.value.as_string() else { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, field must be a column or constant string, but got {}", + constant_expr.value + )) + .set_span(constant_expr.span)); + }; + + // fields are separated by commas and boost is separated by ^ + let field_strs: Vec<&str> = constant_field.split(',').collect(); + let mut column_refs = Vec::with_capacity(field_strs.len()); + for field_str in field_strs { + let field_boosts: Vec<&str> = field_str.split('^').collect(); + if field_boosts.len() > 2 { + return
Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, field string must have only one boost, but got {}", + constant_field + )) + .set_span(constant_expr.span)); + } + let column_expr = Expr::ColumnRef { + span: constant_expr.span, + column: ColumnRef { + database: None, + table: None, + column: ColumnID::Name(Identifier::from_name( + constant_expr.span, + field_boosts[0].trim(), + )), + }, + }; + let box (field_scalar, _) = self.resolve(&column_expr).await?; + let Ok(column_ref) = BoundColumnRef::try_from(field_scalar) else { + return Err(ErrorCode::SemanticError( + "invalid arguments for search function, field must be a column" + .to_string(), + ) + .set_span(constant_expr.span)); + }; + let boost = if field_boosts.len() == 2 { + match f32::from_str(field_boosts[1].trim()) { + Ok(boost) => Some(F32::from(boost)), + Err(_) => { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, boost must be a float value, but got {}", + field_boosts[1] + )) + .set_span(constant_expr.span)); + } + } + } else { + None + }; + column_refs.push((column_ref, boost)); + } + column_refs + } + _ => { + return Err(ErrorCode::SemanticError( + "invalid arguments for search function, field must be a column or constant string".to_string(), + ) + .set_span(span)); + } }; - if column_ref.column.table_index.is_none() { - return Err(ErrorCode::SemanticError( - "invalid arguments for search function, column must in a table".to_string(), - ) + + let box (query_scalar, _) = self.resolve(query_arg).await?; + let Ok(query_expr) = ConstantExpr::try_from(query_scalar.clone()) else { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, query text must be a constant string, but got {}", + query_arg + )) + .set_span(query_scalar.span())); + }; + let Some(query_text) = query_expr.value.as_string() else { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, query text must be 
a constant string, but got {}", + query_arg + )) + .set_span(query_scalar.span())); + }; + + // the match function doesn't support query syntax, + // convert query text to lowercase and remove punctuation characters, + // so that tantivy query parser can parse the query text as plain text + // without syntax + let formated_query_text: String = query_text + .to_lowercase() + .chars() + .map(|v| if v.is_ascii_punctuation() { ' ' } else { v }) + .collect(); + + self.resolve_search_function(span, column_refs, &formated_query_text) + .await + } + + /// Resolve query search function. + /// The first argument is the query text with query syntax. + /// The following query syntax is supported: + /// 1. simple terms, like `title:quick` + /// 2. bool operator terms, like `title:fox AND dog OR cat` + /// 3. must and negative operator terms, like `title:+fox -cat` + /// 4. phrase terms, like `title:"quick brown fox"` + /// 5. multiple field with boost terms, like `title:fox^5 content:dog^2` + async fn resolve_query_search_function( + &mut self, + span: Span, + func_name: &str, + args: &[&Expr], + ) -> Result> { + if !matches!(self.bind_context.expr_context, ExprContext::WhereClause) { + return Err(ErrorCode::SemanticError(format!( + "search function {} can only be used in where clause", + func_name + )) + .set_span(span)); } - let table_index = column_ref.column.table_index.unwrap(); - let table_entry = self.metadata.read().table(table_index).clone(); - let table = table_entry.table(); - let table_info = table.get_table_info(); - - let table_schema = table_info.schema(); - let table_indexes = &table_info.meta.indexes; + // TODO: support options field + if args.len() != 1 { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, {} expects 1 argument, but got {}", + func_name, + args.len() + )) + .set_span(span)); + } - let column_name = column_ref.column.column_name; - let column_id = table_schema.column_id_of(&column_name)?; + let query_arg = args[0]; let
box (query_scalar, _) = self.resolve(query_arg).await?; - let Ok(query_expr) = ConstantExpr::try_from(query_scalar) else { + let Ok(query_expr) = ConstantExpr::try_from(query_scalar.clone()) else { return Err(ErrorCode::SemanticError(format!( "invalid arguments for search function, query text must be a constant string, but got {}", query_arg )) - .set_span(span)); + .set_span(query_scalar.span())); }; let Some(query_text) = query_expr.value.as_string() else { return Err(ErrorCode::SemanticError(format!( "invalid arguments for search function, query text must be a constant string, but got {}", query_arg )) - .set_span(span)); + .set_span(query_scalar.span())); }; + let field_strs: Vec<&str> = query_text.split(' ').collect(); + let mut column_refs = Vec::with_capacity(field_strs.len()); + for field_str in field_strs { + if !field_str.contains(':') { + continue; + } + let field_names: Vec<&str> = field_str.split(':').collect(); + let column_expr = Expr::ColumnRef { + span: query_scalar.span(), + column: ColumnRef { + database: None, + table: None, + column: ColumnID::Name(Identifier::from_name( + query_scalar.span(), + field_names[0].trim(), + )), + }, + }; + let box (field_scalar, _) = self.resolve(&column_expr).await?; + let Ok(column_ref) = BoundColumnRef::try_from(field_scalar) else { + return Err(ErrorCode::SemanticError( + "invalid arguments for search function, field must be a column".to_string(), + ) + .set_span(query_scalar.span())); + }; + column_refs.push((column_ref, None)); + } + + self.resolve_search_function(span, column_refs, query_text) + .await + } + + async fn resolve_search_function( + &mut self, + span: Span, + column_refs: Vec<(BoundColumnRef, Option)>, + query_text: &String, + ) -> Result> { + if column_refs.is_empty() { + return Err(ErrorCode::SemanticError( + "invalid arguments for search function, must specify at least one search column" + .to_string(), + ) + .set_span(span)); + } + if !column_refs.windows(2).all(|c| { + 
c[0].0.column.table_index.is_some() + && c[0].0.column.table_index == c[1].0.column.table_index + }) { + return Err(ErrorCode::SemanticError( + "invalid arguments for search function, all columns must in a table".to_string(), + ) + .set_span(span)); + } + let table_index = column_refs[0].0.column.table_index.unwrap(); + + let table_entry = self.metadata.read().table(table_index).clone(); + let table = table_entry.table(); + let table_info = table.get_table_info(); + let table_schema = table_info.schema(); + let table_indexes = &table_info.meta.indexes; + + let mut query_fields = Vec::with_capacity(column_refs.len()); + let mut column_ids = Vec::with_capacity(column_refs.len()); + for (column_ref, boost) in &column_refs { + let column_name = &column_ref.column.column_name; + let column_id = table_schema.column_id_of(column_name)?; + column_ids.push(column_id); + query_fields.push((column_name.clone(), *boost)); + } + // find inverted index and check schema let mut index_name = "".to_string(); let mut index_version = "".to_string(); let mut index_schema = None; for table_index in table_indexes.values() { - if table_index.column_ids.contains(&column_id) { + if column_ids + .iter() + .all(|id| table_index.column_ids.contains(id)) + { index_name = table_index.name.clone(); index_version = table_index.version.clone(); @@ -2113,13 +2323,13 @@ impl<'a> TypeChecker<'a> { } if index_schema.is_none() { + let column_names = query_fields.iter().map(|c| c.0.clone()).join(", "); return Err(ErrorCode::SemanticError(format!( - "column {} don't have inverted index", - column_name + "columns {} don't have inverted index", + column_names )) .set_span(span)); } - let query_columns = vec![column_name.clone()]; if self .bind_context @@ -2135,7 +2345,7 @@ impl<'a> TypeChecker<'a> { index_name, index_version, index_schema: index_schema.unwrap(), - query_columns, + query_fields, query_text: query_text.to_string(), }; @@ -2147,8 +2357,8 @@ impl<'a> TypeChecker<'a> { 
InternalColumn::new(SEARCH_MATCHED_COL_NAME, InternalColumnType::SearchMatched); let internal_column_binding = InternalColumnBinding { - database_name: column_ref.column.database_name, - table_name: column_ref.column.table_name, + database_name: column_refs[0].0.column.database_name.clone(), + table_name: column_refs[0].0.column.table_name.clone(), internal_column, }; let column = self.bind_context.add_internal_column_binding( diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index dc81749e1121..1059121cd7c2 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -23,6 +23,7 @@ use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::Field; use tantivy::Index; +use tantivy::Score; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_filter; use crate::io::write::create_tokenizer_manager; @@ -30,6 +31,7 @@ use crate::io::write::create_tokenizer_manager; #[derive(Clone)] pub struct InvertedIndexReader { fields: Vec, + field_boosts: Vec<(Field, Score)>, directory: Arc, } @@ -37,19 +39,26 @@ impl InvertedIndexReader { pub async fn try_create( dal: Operator, schema: &DataSchema, - query_columns: &Vec, + query_fields: &Vec<(String, Option)>, index_loc: &str, ) -> Result { - let mut fields = Vec::with_capacity(query_columns.len()); - for column_name in query_columns { - let i = schema.index_of(column_name)?; + let mut fields = Vec::with_capacity(query_fields.len()); + let mut field_boosts = Vec::with_capacity(query_fields.len()); + for (field_name, boost) in query_fields { + let i = schema.index_of(field_name)?; let field = Field::from_field_id(i as u32); fields.push(field); + if let Some(boost) = boost { + field_boosts.push((field, boost.0)); + } } - let directory = 
load_inverted_index_filter(dal.clone(), index_loc.to_string()).await?; - Ok(Self { fields, directory }) + Ok(Self { + fields, + field_boosts, + directory, + }) } // Filter the rows and scores in the block that can match the query text, @@ -62,7 +71,11 @@ impl InvertedIndexReader { let reader = index.reader()?; let searcher = reader.searcher(); - let query_parser = QueryParser::for_index(&index, self.fields.clone()); + let mut query_parser = QueryParser::for_index(&index, self.fields.clone()); + // set optional boost value for the field + for (field, boost) in &self.field_boosts { + query_parser.set_field_boost(*field, *boost); + } let query = query_parser.parse_query(query)?; let collector = TopDocs::with_limit(row_count as usize); diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs index c9c0777d32a9..43e8c3c70a86 100644 --- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs @@ -76,7 +76,7 @@ impl InvertedIndexPruner { let inverted_index_reader = InvertedIndexReader::try_create( self.dal.clone(), &self.inverted_index_info.index_schema, - &self.inverted_index_info.query_columns, + &self.inverted_index_info.query_fields, &index_loc, ) .await?; diff --git a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test index eae20c47b814..0e89ff1de74a 100644 --- a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test @@ -67,11 +67,6 @@ SELECT id, score(), content FROM t WHERE match(content, 'word') 2 1.5948367 A picture is worth a thousand words 4 1.6550698 Actions speak louder than words -query T -SELECT id, score(), content FROM t WHERE match(content, '"early bird"') ----- -3 
4.503372 The early bird catches the worm - statement ok INSERT INTO t VALUES (11, '我喜欢在周末的时候去公园散步,感受大自然的美丽。'), @@ -194,13 +189,13 @@ INSERT INTO t_small_blocks VALUES (9, 'You can not judge a book by its cover'), (10, 'An apple a day keeps the doctor away') -query T -SELECT id, content FROM t WHERE match(content, '"early bird"') +query IT +SELECT id, content FROM t WHERE query('content:"early bird"') ---- 3 The early bird catches the worm query T -EXPLAIN SELECT id, content FROM t WHERE match(content, '"early bird"') +EXPLAIN SELECT id, content FROM t WHERE query('content:"early bird"') ---- Filter ├── output columns: [t.id (#0), t.content (#1)] @@ -217,8 +212,166 @@ Filter ├── push downs: [filters: [t._search_matched (#2)], limit: NONE] └── estimated rows: 29.00 +statement ok +CREATE TABLE books( + id int, + title string, + author string, + description string +) + +statement ok +CREATE INVERTED INDEX IF NOT EXISTS idx1 ON books(title, author, description) tokenizer = 'chinese' + +statement ok +INSERT INTO books VALUES +(1, '这就是ChatGPT', '[美]斯蒂芬·沃尔弗拉姆(Stephen Wolfram)', 'ChatGPT是OpenAI开发的人工智能聊天机器人程序,于2022年11月推出。它能够自动生成一些表面上看起来像人类写的文字,这是一件很厉害且出乎大家意料的事。那么,它是如何做到的呢?又为何能做到呢?本书会大致介绍ChatGPT的内部机制,然后探讨一下为什么它能很好地生成我们认为有意义的文本。'), +(2, 'Python深度学习(第2版)', '[美]弗朗索瓦·肖莱(François Chollet)', '本书由流行深度学习框架Keras之父弗朗索瓦·肖莱执笔,通过直观的解释和丰富的示例帮助你构建深度学习知识体系。作者避免使用数学符号,转而采用Python代码来解释深度学习的核心思想。全书共计14章,既涵盖了深度学习的基本原理,又体现了这一迅猛发展的领域在近几年里取得的重要进展,包括Transformer架构的原理和示例。读完本书后,你将能够使用Keras解决从计算机视觉到自然语言处理等现实世界的诸多问题,包括图像分类、图像分割、时间序列预测、文本分类、机器翻译、文本生成等。'), +(3, '大模型应用开发极简入门', '[比]奥利维耶·卡埃朗(Olivier Caelen)', '本书为大模型应用开发极简入门手册,为初学者提供了一份清晰、全面的“最小可用知识”,带领大家快速了解GPT-4和ChatGPT的工作原理及优势,并在此基础上使用流行的Python编程语言构建大模型应用。通过本书,你不仅可以学会如何构建文本生成、问答和内容摘要等初阶大模型应用,还能了解到提示工程、模型微调、插件、LangChain等高阶实践技术。书中提供了简单易学的示例,帮你理解并应用在自己的项目中。此外,书后还提供了一份术语表,方便你随时参考。'), +(4, '白话深度学习的数学', '[日]立石贤吾', '本书通过想要学习深度学习的程序员绫乃和她朋友美绪的对话,逐步讲解深度学习中实用的数学基础知识。内容涉及神经网络的结构、感知机、正向传播和反向传播,以及卷积神经网络。其中,重点讲解了容易成为学习绊脚石的数学公式和符号。同时,还通过实际的Python 
编程实现神经网络,加深读者对相关数学知识的理解。'), +(5, 'BERT基础教程:Transformer大模型实战', '[印] 苏达哈尔桑·拉维昌迪兰(Sudharsan Ravichandiran)', '本书聚焦谷歌公司开发的BERT自然语言处理模型,由浅入深地介绍了BERT的工作原理、BERT的各种变体及其应用。本书呈现了大量示意图、代码和实例,详细解析了如何训练BERT模型、如何使用BERT模型执行自然语言推理任务、文本摘要任务、问答任务、命名实体识别任务等各种下游任务,以及如何将BERT模型应用于多种语言。通读本书后,读者不仅能够全面了解有关BERT的各种概念、术语和原理,还能够使用BERT模型及其变体执行各种自然语言处理任务。'), +(6, 'Flask Web开发:基于Python的Web应用开发实战(第2版)', '[美]米格尔•格林贝格(Miguel Grinberg)', '本书共分三部分,全面介绍如何基于Python微框架Flask进行Web开发。第一部分是Flask简介,介绍使用Flask框架及扩展开发Web程序的必备基础知识。第二部分则给出一个实例,真正带领大家一步步开发完整的博客和社交应用Flasky,从而将前述知识融会贯通,付诸实践。第三部分介绍了发布应用之前必须考虑的事项,如单元测试策略、性能分析技术、Flask程序的部署方式等。第2版针对Python 3.6全面修订。'), +(7, 'Apache Pulsar实战', '[美]戴维·克杰鲁姆加德(David Kjerrumgaard)', 'Apache Pulsar被誉为下一代分布式消息系统,旨在打通发布/ 订阅式消息传递和流数据分析。本书作者既与Pulsar项目创始成员共事多年,又有在生产环境中使用Pulsar 的丰富经验。正是这些宝贵的经验成就了这本Pulsar“避坑指南”,为想轻松上手Pulsar的读者铺平了学习之路。本书分为三大部分,共有12章。第一部分概述Pulsar的设计理念和用途。第二部分介绍Pulsar的特性。第三部分以一个虚构的外卖应用程序为例,详细地介绍Pulsar Functions框架的用法,并展示如何用它实现常见的微服务设计模式。本书示例采用Java语言,并同时提供Python实现。'), +(8, 'Rust程序设计(第2版)', '[美]吉姆 • 布兰迪(Jim Blandy)', '本书是Rust领域经典参考书,由业内资深系统程序员编写,广受读者好评。书中全面介绍了Rust这种新型系统编程语言——具有无与伦比的安全性,兼具C和C++的高性能,并大大简化了并发程序的编写。第2版对上一版内容进行了重组和完善,新增了对“异步编程”的介绍。借助书中的大量案例,你也能用Rust编写出兼顾安全性与高性能的程序。本书内容包括基本数据类型、所有权、引用、表达式、错误处理、crate与模块、结构、枚举与模式等基础知识,以及特型与泛型、闭包、迭代器、集合、字符串与文本、输入与输出、并发、异步编程、宏等进阶知识。'), +(9, 'Vue.js设计与实现', '霍春阳(HcySunYang)', '本书基于Vue.js 3,从规范出发,以源码为基础,并结合大量直观的配图,循序渐进地讲解Vue.js中各个功能模块的实现,细致剖析框架设计原理。全书共18章,分为六篇,主要内容包括:框架设计概览、响应系统、渲染器、组件化、编译器和服务端渲染等。通过阅读本书,对Vue.js 2/3具有上手经验的开发人员能够进一步理解Vue.js框架的实现细节,没有Vue.js使用经验但对框架设计感兴趣的前端开发人员,能够快速掌握Vue.js的设计原理。'), +(10, '前端架构设计', '[美]迈卡·高保特(Micah Godbolt)', '本书展示了一名成熟的前端架构师对前端开发全面而深刻的理解。作者结合自己在Red Hat公司的项目实战经历,探讨了前端架构原则和前端架构的核心内容,包括工作流程、测试流程和文档记录,以及作为前端架构师所要承担的具体开发工作,包括HTML、JavaScript和CSS等。'), +(11, 'OpenAI GPT For Python Developers, 2nd Edition', 'Aymen El Amri', 'The knowledge you’ll acquire from this guide will be applicable to the current families of GPT models (GPT-3, GPT-3.5, GPT-4, etc.) 
and will likely also be relevant to GPT-5, should it ever be released.'), +(12, 'Developing Apps with GPT-4 and ChatGPT', 'Olivier Caelen, Marie-Alice Blete', 'This minibook is a comprehensive guide for Python developers who want to learn how to build applications with large language models. Authors Olivier Caelen and Marie-Alice Blete cover the main features and benefits of GPT-4 and ChatGPT and explain how they work. You’ll also get a step-by-step guide for developing applications using the GPT-4 and ChatGPT Python library, including text generation, Q&A, and content summarization tools.'), +(13, 'Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT', 'Leo Porter, Daniel Zingaro', 'AI has changed the way we write computer programs. With tools like Copilot and ChatGPT, you can describe what you want in plain English, and watch your AI assistant generate the code right before your eyes. It’s perfect for beginners, or anyone who’s struggled with the steep learning curve of traditional programming.'), +(14, 'Building Recommendation Systems in Python and JAX', 'Bryan Bischof, Hector Yee', 'Implementing and designing systems that make suggestions to users are among the most popular and essential machine learning applications available. Whether you want customers to find the most appealing items at your online store, videos to enrich and entertain them, or news they need to know, recommendation systems (RecSys) provide the way.'), +(15, 'Code Like a Pro in Rust', 'Brenden Matthews', 'Code Like a Pro in Rust dives deep into memory management, asynchronous programming, and the core Rust skills that make you a Rust pro! Plus, you’ll find essential productivity techniques for Rust testing, tooling, and project management. 
You’ll soon be writing high-quality code that needs way less maintenance overhead.'), +(16, 'Rust Atomics and Locks', 'Mara Bos', 'The Rust programming language is extremely well suited for concurrency, and its ecosystem has many libraries that include lots of concurrent data structures, locks, and more. But implementing those structures correctly can be very difficult. Even in the most well-used libraries, memory ordering bugs are not uncommon.'), +(17, 'Rust for Rustaceans', 'Jon Gjengset', 'For developers who’ve mastered the basics, this book is the next step on your way to professional-level programming in Rust. It covers everything you need to build and maintain larger code bases, write powerful and flexible applications and libraries, and confidently expand the scope and complexity of your projects.'), +(18, 'Database Internals', 'Alex Petrov', 'When it comes to choosing, using, and maintaining a database, understanding its internals is essential. But with so many distributed databases and tools available today, it’s often difficult to understand what each one offers and how they differ. With this practical guide, Alex Petrov guides developers through the concepts behind modern database and storage engine internals.'), +(19, 'Time Series Databases', 'Ted Dunning, Ellen Friedman', 'Time series data is of growing importance, especially with the rapid expansion of the Internet of Things. This concise guide shows you effective ways to collect, persist, and access large-scale time series data for analysis. You’ll explore the theory behind time series databases and learn practical methods for implementing them. 
Authors Ted Dunning and Ellen Friedman provide a detailed examination of open source tools such as OpenTSDB and new modifications that greatly speed up data ingestion.'), +(20, 'CockroachDB: The Definitive Guide', 'Guy Harrison, Jesse Seldess, Ben Darnell', 'Get the lowdown on CockroachDB, the distributed SQL database built to handle the demands of today’s data-driven cloud applications. In this hands-on guide, software developers, architects, and DevOps/SRE teams will learn how to use CockroachDB to create applications that scale elastically and provide seamless delivery for end users while remaining indestructible. Teams will also learn how to migrate existing applications to CockroachDB’s performant, cloud-native data architecture.') + +query IFT +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY score() DESC +---- +2 8.500097 Python深度学习(第2版) +6 6.7982116 Flask Web开发:基于Python的Web应用开发实战(第2版) +14 5.509352 Building Recommendation Systems in Python and JAX +11 5.263399 OpenAI GPT For Python Developers, 2nd Edition +13 4.4659142 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +12 1.8816761 Developing Apps with GPT-4 and ChatGPT +4 1.5154111 白话深度学习的数学 +3 1.3515654 大模型应用开发极简入门 +7 1.2369337 Apache Pulsar实战 + +query IFT +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'ChatGPT') ORDER BY score() DESC +---- +1 14.471097 这就是ChatGPT +12 10.599274 Developing Apps with GPT-4 and ChatGPT +13 7.9292374 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +3 1.77537 大模型应用开发极简入门 + +query IFT +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计') ORDER BY score() DESC +---- +9 14.486509 Vue.js设计与实现 +10 10.238626 前端架构设计 +8 9.061771 Rust程序设计(第2版) +7 3.2078874 Apache Pulsar实战 + +query IFT +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计 实现') ORDER BY score() DESC +---- +9 32.441788 Vue.js设计与实现 +10 10.238626 
前端架构设计 +8 9.061771 Rust程序设计(第2版) +7 5.9086094 Apache Pulsar实战 +4 2.3153453 白话深度学习的数学 + +query IFT +SELECT id, score(), title FROM books WHERE query('title:python') ORDER BY score() DESC +---- +2 1.4378065 Python深度学习(第2版) +14 1.1018704 Building Recommendation Systems in Python and JAX +11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition +6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) +13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT + +query IFT +SELECT id, score(), title FROM books WHERE query('title:python OR rust') ORDER BY score() DESC +---- +17 1.8827661 Rust for Rustaceans +16 1.6531605 Rust Atomics and Locks +8 1.5581512 Rust程序设计(第2版) +2 1.4378065 Python深度学习(第2版) +15 1.3975171 Code Like a Pro in Rust +14 1.1018704 Building Recommendation Systems in Python and JAX +11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition +6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) +13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT + +query IFT +SELECT id, score(), title FROM books WHERE query('title:python AND rust') ORDER BY score() DESC +---- + +query IFT +SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC +---- +9 5.063791 Vue.js设计与实现 +7 2.189928 Apache Pulsar实战 +5 1.7138567 BERT基础教程:Transformer大模型实战 +6 1.2924166 Flask Web开发:基于Python的Web应用开发实战(第2版) + +query IFT +SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC +---- +16 5.0420737 Rust Atomics and Locks + +query IFT +SELECT id, score(), title FROM books WHERE query('title:"Python深度学习"') ORDER BY score() DESC +---- +2 6.005718 Python深度学习(第2版) + +query IFT +SELECT id, score(), title FROM books WHERE query('title:(+python -学习)') ORDER BY score() DESC +---- +14 1.1018704 Building Recommendation Systems in Python and JAX +11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition +6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) +13 0.8931828 Learn 
AI-Assisted Python Programming: With GitHub Copilot and ChatGPT + +query IFT +SELECT id, score(), title FROM books WHERE query('title:+设计 -实现') ORDER BY score() DESC +---- +10 2.0477252 前端架构设计 +8 1.8123543 Rust程序设计(第2版) + +query IFT +SELECT id, score(), title FROM books WHERE query('title:+设计 实现') ORDER BY score() DESC +---- +9 5.063791 Vue.js设计与实现 +10 2.0477252 前端架构设计 +8 1.8123543 Rust程序设计(第2版) + +query IFT +SELECT id, score(), title FROM books WHERE query('title:python^5 description:chatgpt^2.1') ORDER BY score() DESC +---- +13 7.890149 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +2 7.1890326 Python深度学习(第2版) +14 5.509352 Building Recommendation Systems in Python and JAX +11 5.263399 OpenAI GPT For Python Developers, 2nd Edition +6 4.8319726 Flask Web开发:基于Python的Web应用开发实战(第2版) +1 4.732555 这就是ChatGPT +12 4.325484 Developing Apps with GPT-4 and ChatGPT +3 3.106897 大模型应用开发极简入门 + +query IFT +SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC +---- +9 25.318954 Vue.js设计与实现 +4 22.395063 白话深度学习的数学 +10 10.238626 前端架构设计 +8 9.061771 Rust程序设计(第2版) + statement ok use default statement ok drop database test_index +