diff --git a/Cargo.lock b/Cargo.lock index d1eb1005a7abc..9235ef0054a02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1683,7 +1683,7 @@ dependencies = [ "bitflags 2.9.0", "cexpr", "clang-sys", - "itertools 0.11.0", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -9087,7 +9087,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.57.0", + "windows-core 0.61.0", ] [[package]] @@ -11933,7 +11933,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.9.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "stable_deref_trait", ] @@ -12845,7 +12845,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck 0.5.0", - "itertools 0.11.0", + "itertools 0.14.0", "log", "multimap", "once_cell", @@ -12865,7 +12865,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.11.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.106", @@ -15767,8 +15767,8 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" -version = "0.25.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +version = "0.26.0" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "aho-corasick", "arc-swap", @@ -15812,6 +15812,7 @@ dependencies = [ "tempfile", "thiserror 2.0.12", "time", + "typetag", "uuid", "winapi", ] @@ -15819,7 +15820,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.9.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "bitpacking 0.9.2", ] @@ -15827,7 +15828,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "downcast-rs", "fastdivide", @@ -15842,7 +15843,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.10.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "async-trait", "byteorder", @@ -15865,7 +15866,7 @@ dependencies = [ [[package]] name = "tantivy-jieba" version = "0.17.0" -source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=ac27464#ac27464d5d2f35320b83cd7cb66df68052d9bc18" +source = "git+https://github.com/b41sh/tantivy-jieba?rev=0d18ff7872af581fb799c07f32cbad84ac5993f7#0d18ff7872af581fb799c07f32cbad84ac5993f7" dependencies = [ "jieba-rs", "lazy_static", @@ -15875,7 +15876,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.25.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "fnv", "nom 7.1.3", @@ -15887,7 +15888,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "futures-util", "itertools 0.14.0", @@ -15900,17 +15901,16 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "murmurhash32", - "rand_distr", "tantivy-common", ] [[package]] name = "tantivy-tokenizer-api" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" +source = "git+https://github.com/b41sh/tantivy?rev=7cec26400695dc9d8299751c3b36e8b9eee6abbc#7cec26400695dc9d8299751c3b36e8b9eee6abbc" dependencies = [ "serde", ] @@ -16770,9 +16770,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "typetag" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f" +checksum = "be2212c8a9b9bcfca32024de14998494cf9a5dfa59ea1b829de98bac374b86bf" dependencies = [ "erased-serde", "inventory", @@ -16783,9 +16783,9 @@ dependencies = [ [[package]] name = "typetag-impl" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952" +checksum = "27a7a9b72ba121f6f1f6c3632b85604cac41aedb5ddc70accbebb6cac83de846" dependencies = [ "proc-macro2", "quote", @@ -17820,7 +17820,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index e16e8641180a5..6cff91c22e2ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -485,7 +485,7 @@ strum = "0.24.1" sub-cache = "0.2.1" sys-info = "0.9" sysinfo = "0.34.2" -tantivy = "0.25.0" +tantivy = "0.26.0" tantivy-common = "0.10.0" tantivy-fst = "0.5" tantivy-jieba = "0.17.0" @@ -651,9 +651,9 @@ recursive = { git = "https://github.com/datafuse-extras/recursive.git", rev = "1 sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" } state-machine-api = { git = "https://github.com/databendlabs/state-machine-api.git", tag = "v0.3.4" } sub-cache = { git = "https://github.com/databendlabs/sub-cache", tag = "v0.2.1" } -tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "9065a4d" } -tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "9065a4d", package = "tantivy-common" } -tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "ac27464" } -tantivy-query-grammar = { git = "https://github.com/datafuse-extras/tantivy", rev = "9065a4d", package = "tantivy-query-grammar" } +tantivy = { git = "https://github.com/b41sh/tantivy", rev = "7cec26400695dc9d8299751c3b36e8b9eee6abbc" } +tantivy-common = { git = "https://github.com/b41sh/tantivy", rev = "7cec26400695dc9d8299751c3b36e8b9eee6abbc", package = "tantivy-common" } +tantivy-jieba = { git = "https://github.com/b41sh/tantivy-jieba", rev = "0d18ff7872af581fb799c07f32cbad84ac5993f7" } +tantivy-query-grammar = { git = "https://github.com/b41sh/tantivy", rev = "7cec26400695dc9d8299751c3b36e8b9eee6abbc", package = "tantivy-query-grammar" } watcher = { git = "https://github.com/databendlabs/watcher", tag = "v0.4.2" } xorfilter-rs = { git = "https://github.com/datafuse-extras/xorfilter", tag = "databend-alpha.4" } diff --git a/src/query/service/tests/it/storages/testdata/columns_table.txt b/src/query/service/tests/it/storages/testdata/columns_table.txt index c146627997f53..1a73e32318f6e 100644 --- a/src/query/service/tests/it/storages/testdata/columns_table.txt +++ b/src/query/service/tests/it/storages/testdata/columns_table.txt @@ -500,3 +500,5 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'webhook_options' | 'system' | 'notifications' | 'Nullable(Variant)' | 'VARIANT' | '' | '' | 'YES' | '' | | 'workload_groups' | 'system' | 'users' | 'Nullable(String)' | 'VARCHAR' | '' | '' | 'YES' | '' | +-----------------------------------+----------------------+---------------------------+-----------------------+---------------------+----------+----------+----------+----------+ + + diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index ef0514dd1bc9d..53165255f575f 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -175,7 +175,7 @@ impl InvertedIndexReader { let (matched_rows, matched_scores) = if self.has_score { let collector = TopDocs::with_limit(self.row_count as usize); - let docs = searcher.search(&query, &collector)?; + let docs = searcher.search(&query, &collector.order_by_score())?; let mut matched_rows = Vec::with_capacity(docs.len()); let mut matched_scores = Vec::with_capacity(docs.len()); @@ -436,7 +436,7 @@ impl InvertedIndexReader { block_postings_map.insert(term_id, block_postings); } else if slice_name.starts_with("pos") { let term_id = id; - let position_reader = PositionReader::open(slice_data)?; + let position_reader = PositionReader::open(slice_data, None)?; position_reader_map.insert(term_id, position_reader); } else if slice_name.starts_with("fieldnorm") { let field_id = id as u32; diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs index ad97f760b8eb1..4f12d09b61351 100644 --- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use databend_common_catalog::plan::InvertedIndexInfo; use databend_common_catalog::plan::PushDownInfo; use databend_common_exception::Result; +use databend_common_expression::types::DataType; use databend_common_expression::types::F32; use databend_storages_common_io::ReadSettings; use opendal::Operator; @@ -95,6 +96,14 @@ impl InvertedIndexPruner { need_position = true; } }); + for field_id in &field_ids { + let field = inverted_index_info.index_schema.field(*field_id as usize); + let data_type = field.data_type().remove_nullable(); + if data_type == DataType::Variant { + need_position = true; + break; + } + } // whether need to generate score internl column let has_score = inverted_index_info.has_score; diff --git a/tests/sqllogictests/suites/query/index/04_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/query/index/04_inverted_index/04_0000_inverted_index_base.test index 56275d246e699..fa7cb64534e8d 100644 --- a/tests/sqllogictests/suites/query/index/04_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/query/index/04_inverted_index/04_0000_inverted_index_base.test @@ -500,6 +500,47 @@ query IT select * from t2 where query('body:test'); ---- + +statement ok +CREATE TABLE t3 (id int, body variant, INVERTED INDEX idx (body)) + +statement ok +INSERT INTO t3 VALUES +(1, '{"videoInfo":{"extraData":[{ "name": "codecA", "type": "mp4" },{ "name": "codecB", "type": "jpg" }]}}'), +(2, '{"videoInfo":{"extraData":[{ "name": "codecA", "type": "jpg" },{ "name": "codecA", "type": "mp4" }]}}'), +(3, '{"videoInfo":{"extraData":[{ "name": "codecA", "type": "jpg" },{ "name": "codecB", "type": "mp4" }]}}'), +(4, '{"videoInfo":{"extraData":[{ "name": "codecA", "attributes": { "type": "jpg" }}, { "name": "codecB", "attributes": { "type": "mp4" }}]}}'), +(5, '{"videoInfo":{"extraData":[{ "name": "codecA", "attributes": { "type": "mp4" }}, { "name": "codecB", "attributes": { "type": "jpg" }}]}}'), +(6, '{"videoInfo":{"extraData":[{ "name": "codec foo", "type": "jpg" }, { "name": "codec bar", "type": "mp4" }]}}'), +(7, '{"videoInfo":{"extraData":[{ "name": "codec foo", "type": "mp4" }]}}'); + +query IT +select * from t3 where query('body.videoInfo.extraData.name:codecA AND body.videoInfo.extraData.type:jpg'); +---- +2 {"videoInfo":{"extraData":[{"name":"codecA","type":"jpg"},{"name":"codecA","type":"mp4"}]}} +3 {"videoInfo":{"extraData":[{"name":"codecA","type":"jpg"},{"name":"codecB","type":"mp4"}]}} + +query IT +select * from t3 where query('body.videoInfo.extraData.name:codecA AND body.videoInfo.extraData.type:mp4'); +---- +1 {"videoInfo":{"extraData":[{"name":"codecA","type":"mp4"},{"name":"codecB","type":"jpg"}]}} +2 {"videoInfo":{"extraData":[{"name":"codecA","type":"jpg"},{"name":"codecA","type":"mp4"}]}} + +query IT +select * from t3 where query('body.videoInfo.extraData.name:codecA AND body.videoInfo.extraData.attributes.type:jpg'); +---- +4 {"videoInfo":{"extraData":[{"attributes":{"type":"jpg"},"name":"codecA"},{"attributes":{"type":"mp4"},"name":"codecB"}]}} + +query IT +select * from t3 where query('body.videoInfo.extraData.name:"codec foo" AND body.videoInfo.extraData.type:mp4'); +---- +7 {"videoInfo":{"extraData":[{"name":"codec foo","type":"mp4"}]}} + +query IT +select * from t3 where query('body.videoInfo.extraData.name:codecB AND body.videoInfo.extraData.type:jpg'); +---- +1 {"videoInfo":{"extraData":[{"name":"codecA","type":"mp4"},{"name":"codecB","type":"jpg"}]}} + statement ok CREATE TABLE t_native (id int, content string, INVERTED INDEX idx1 (content)) storage_format = 'native' row_per_page = 2; @@ -528,3 +569,4 @@ use default statement ok drop database test_inverted_index +