Skip to content

Commit

Permalink
feat(query): inverted index support json type (databendlabs#15267)
Browse files Browse the repository at this point in the history
  • Loading branch information
b41sh authored Apr 26, 2024
1 parent 88331f0 commit 8e67459
Show file tree
Hide file tree
Showing 10 changed files with 243 additions and 32 deletions.
23 changes: 13 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ members = [

[workspace.dependencies]
# databend maintains
jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "a7325f4" }
jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "3fe3acd" }

opendal = { version = "0.45.1", features = [
"layers-minitrace",
Expand Down
1 change: 1 addition & 0 deletions src/query/ee/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ log = { workspace = true }
tempfile = "3.4.0"

[dev-dependencies]
jsonb = { workspace = true }

[build-dependencies]
databend-common-building = { path = "../../common/building" }
139 changes: 138 additions & 1 deletion src/query/ee/tests/it/inverted_index/pruning.rs

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions src/query/functions/src/scalars/geometry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,6 @@ pub fn register(registry: &mut FunctionRegistry) {
builder.len(),
ErrorCode::GeometryError(e.to_string()).to_string(),
);
return;
}
};
builder.commit_row();
Expand Down Expand Up @@ -531,7 +530,6 @@ pub fn register(registry: &mut FunctionRegistry) {
builder.len(),
ErrorCode::GeometryError(e.to_string()).to_string(),
);
return;
}
};
builder.commit_row();
Expand Down
6 changes: 4 additions & 2 deletions src/query/sql/src/planner/binder/ddl/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,11 @@ impl Binder {
for column in columns {
match table_schema.field_with_name(&column.name) {
Ok(field) => {
if field.data_type.remove_nullable() != TableDataType::String {
if field.data_type.remove_nullable() != TableDataType::String
&& field.data_type.remove_nullable() != TableDataType::Variant
{
return Err(ErrorCode::UnsupportedIndex(format!(
"Inverted index currently only support String type, but the type of column {} is {}",
"Inverted index currently only support String and variant type, but the type of column {} is {}",
column, field.data_type
)));
}
Expand Down
8 changes: 7 additions & 1 deletion src/query/sql/src/planner/semantic/type_check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2234,14 +2234,20 @@ impl<'a> TypeChecker<'a> {
continue;
}
let field_names: Vec<&str> = field_str.split(':').collect();
// If the field is of JSON type, the query must specify the key path within the object.
// For example, if the field `info` has the value
// `{"tags":{"id":10,"env":"prod","name":"test"}}`,
// a query can be written as `info.tags.env:prod`.
let field_name = field_names[0].trim();
let sub_field_names: Vec<&str> = field_name.split('.').collect();
let column_expr = Expr::ColumnRef {
span: query_scalar.span(),
column: ColumnRef {
database: None,
table: None,
column: ColumnID::Name(Identifier::from_name(
query_scalar.span(),
field_names[0].trim(),
sub_field_names[0].trim(),
)),
},
};
Expand Down
55 changes: 42 additions & 13 deletions src/query/storages/fuse/src/io/write/inverted_index_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,11 @@ use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE;
use opendal::Operator;
use serde::Deserialize;
use serde::Serialize;
use serde_json::Map;
use tantivy::schema::Document;
use tantivy::schema::Field;
use tantivy::schema::IndexRecordOption;
use tantivy::schema::JsonObjectOptions;
use tantivy::schema::Schema;
use tantivy::schema::TextFieldIndexing;
use tantivy::schema::TextOptions;
Expand All @@ -63,6 +65,7 @@ use tantivy::tokenizer::StopWordFilter;
use tantivy::tokenizer::TextAnalyzer;
use tantivy::tokenizer::TokenizerManager;
use tantivy::Directory;
// use tantivy::schema::document::OwnedValue;
use tantivy::Index;
use tantivy::IndexBuilder;
use tantivy::IndexSettings;
Expand Down Expand Up @@ -163,18 +166,26 @@ impl InvertedIndexWriter {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer(&tokenizer_name)
.set_index_option(index_record);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing.clone());
let json_options = JsonObjectOptions::default().set_indexing_options(text_field_indexing);

let mut schema_builder = Schema::builder();
let mut index_fields = Vec::with_capacity(schema.fields.len());
for field in &schema.fields {
if field.data_type().remove_nullable() != DataType::String {
return Err(ErrorCode::IllegalDataType(format!(
"inverted index only support String type, but got {}",
field.data_type()
)));
}
let index_field = schema_builder.add_text_field(field.name(), text_options.clone());
let index_field = match field.data_type().remove_nullable() {
DataType::String => {
schema_builder.add_text_field(field.name(), text_options.clone())
}
DataType::Variant => {
schema_builder.add_json_field(field.name(), json_options.clone())
}
_ => {
return Err(ErrorCode::IllegalDataType(format!(
"inverted index only support String and Variant type, but got {}",
field.data_type()
)));
}
};
index_fields.push(index_field);
}
let index_schema = schema_builder.build();
Expand Down Expand Up @@ -221,15 +232,33 @@ impl InvertedIndexWriter {
}
}

let mut types = Vec::with_capacity(self.schema.num_fields());
for field in self.schema.fields() {
let ty = field.data_type().remove_nullable();
types.push(ty);
}
for i in 0..block.num_rows() {
let mut doc = Document::new();
for j in 0..block.num_columns() {
for (j, typ) in types.iter().enumerate() {
let field = Field::from_field_id(j as u32);
let column = block.get_by_offset(j);
if let ScalarRef::String(text) = unsafe { column.value.index_unchecked(i) } {
doc.add_text(field, text);
} else {
doc.add_text(field, "");
match unsafe { column.value.index_unchecked(i) } {
ScalarRef::String(text) => doc.add_text(field, text),
ScalarRef::Variant(jsonb_val) => {
// Only JSON objects are indexed; any other JSON value is added as an empty object.
if let Ok(Some(obj_val)) = jsonb::to_serde_json_object(jsonb_val) {
doc.add_json_object(field, obj_val);
} else {
doc.add_json_object(field, Map::new());
}
}
_ => {
if typ == &DataType::Variant {
doc.add_json_object(field, Map::new());
} else {
doc.add_text(field, "");
}
}
}
}
self.operations.push(UserOperation::Add(doc));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,41 @@ onlyif mysql
statement error 1105
SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC

statement ok
CREATE TABLE t1 (id int, body json)

statement ok
CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese'

statement ok
INSERT INTO t1 VALUES
(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'),
(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'),
(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'),
(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'),
(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'),
(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'),
(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'),
(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'),
(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'),
(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}')

query IFT
SELECT id, score(), body FROM t1 WHERE query('body.title:energy')
----
2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"}

query IFT
SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology')
----
3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"}
5 2.4057739 {"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"}

query IFT
SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术')
----
6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"}
10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"}

statement ok
use default
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
8 18446744073709551615
9 12.34
10 -56.78
11 140000
11 140000.0
12 0.0099
13 "abcd"
14 "test"
Expand All @@ -28,7 +28,7 @@
28 18446744073709551615
29 12.34
30 -56.78
31 140000
31 140000.0
32 0.0099
33 "abcd"
34 "test"
Expand Down

0 comments on commit 8e67459

Please sign in to comment.