From 8a5f4a39cd3f0a44797e6301961b12c7bf84c21b Mon Sep 17 00:00:00 2001 From: "bors[bot]" <26634292+bors[bot]@users.noreply.github.com> Date: Wed, 30 Nov 2022 16:51:48 +0000 Subject: [PATCH 1/4] Merge #712 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 712: Fix bulk facet indexing bug r=Kerollmops a=loiclec # Pull Request ## Related issue Fixes (partially, until merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/3165 ## What does this PR do? Fixes a bug where indexing certain numbers of filterable attribute values in bulk led to corrupted facet databases. This was due to a lossy integer conversion which would ultimately prevent entire levels of the facet database from being written to LMDB. More specifically, this change was made: ```diff - if cur_writer_len as u8 >= self.min_level_size { + if cur_writer_len >= self.min_level_size as usize { ``` I also checked other comparisons to `min_level_size` and other conversions such as `x as u8` in this part of the codebase. Co-authored-by: Loïc Lecrenier --- milli/src/update/facet/bulk.rs | 52 +++++++++++++++++++++++++-- milli/src/update/facet/incremental.rs | 2 ++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 30660d5af..b1065c0bc 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -288,6 +288,8 @@ impl FacetsUpdateBulkInner { for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } + // The conversion of sub_bitmaps.len() to a u8 will always be correct + // since its length is bounded by max_group_size, which is a u8.
group_sizes.push(sub_bitmaps.len() as u8); left_bounds.push(left_bound); @@ -340,7 +342,7 @@ impl FacetsUpdateBulkInner { } } // if we inserted enough elements to reach the minimum level size, then we push the writer - if cur_writer_len as u8 >= self.min_level_size { + if cur_writer_len >= self.min_level_size as usize { sub_writers.push(writer_into_reader(cur_writer)?); } else { // otherwise, if there are still leftover elements, we give them to the level above @@ -357,11 +359,15 @@ impl FacetsUpdateBulkInner { mod tests { use std::iter::once; + use big_s::S; + use maplit::hashset; use roaring::RoaringBitmap; + use crate::documents::documents_batch_reader_from_objects; use crate::heed_codec::facet::OrderedF64Codec; - use crate::milli_snap; + use crate::index::tests::TempIndex; use crate::update::facet::tests::FacetIndex; + use crate::{db_snap, milli_snap}; #[test] fn insert() { @@ -443,4 +449,46 @@ mod tests { test("large_group_small_min_level", 16, 2); test("odd_group_odd_min_level", 7, 3); } + + #[test] + fn bug_3165() { + // Indexing a number of facet values that falls within certains ranges (e.g. 22_540 qualifies) + // would lead to a facet DB which was missing some levels. + // That was because before writing a level into the database, we would + // check that its size was higher than the minimum level size using + // a lossy integer conversion: `level_size as u8 >= min_level_size`. + // + // This missing level in the facet DBs would make the incremental indexer + // (and other search algorithms) crash. + // + // https://github.com/meilisearch/meilisearch/issues/3165 + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("id") }); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..=22_540 { + documents.push( + serde_json::json! 
{ + { + "id": i as u64, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); + db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); + } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index ddf55b06c..223d4fc63 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -436,6 +436,8 @@ impl FacetsUpdateIncrementalInner { level: highest_level + 1, left_bound: first_key.unwrap().left_bound, }; + // Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size` + // when it is created above. let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; to_add.push((key.into_owned(), value)); } From 58713e2a365dabbe3887a1ece6128682c127f448 Mon Sep 17 00:00:00 2001 From: "bors[bot]" <26634292+bors[bot]@users.noreply.github.com> Date: Mon, 5 Dec 2022 19:57:08 +0000 Subject: [PATCH 2/4] Merge #722 722: Geosearch for zero radius r=irevoire a=amab8901 # Pull Request ## Related issue Fixes #3167 (https://github.com/meilisearch/meilisearch/issues/3167) ## What does this PR do? - allows Geosearch with zero radius to return the specified location when the coordinates match perfectly (instead of returning nothing). See link for more details. - new attempt on https://github.com/meilisearch/milli/pull/713 ## PR checklist Please check if your PR fulfills the following requirements: - [ X ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [ X ] Have you read the contributing guidelines? - [ X ] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! 
Co-authored-by: amab8901 Co-authored-by: Tamo --- milli/src/search/facet/filter.rs | 53 ++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 9b87353b0..3842a5f56 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -420,7 +420,8 @@ impl<'a> Filter<'a> { let result = rtree .nearest_neighbor_iter(&xyz_base_point) .take_while(|point| { - distance_between_two_points(&base_point, &point.data.1) < radius + distance_between_two_points(&base_point, &point.data.1) + <= radius + f64::EPSILON }) .map(|point| point.data.0) .collect(); @@ -457,10 +458,9 @@ mod tests { #[test] fn empty_db() { let index = TempIndex::new(); - // Set the filterable fields to be the channel. + //Set the filterable fields to be the channel. index .update_settings(|settings| { - settings.set_searchable_fields(vec![S("PrIcE")]); // to keep the fields order settings.set_filterable_fields(hashset! { S("PrIcE") }); }) .unwrap(); @@ -626,6 +626,52 @@ mod tests { assert_eq!(documents_ids, vec![2]); } + #[test] + fn zero_radius() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! 
{ S("_geo") }); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 1, + "name": "Nàpiz' Milano", + "address": "Viale Vittorio Veneto, 30, 20124, Milan, Italy", + "type": "pizza", + "rating": 9, + "_geo": { + "lat": 45.4777599, + "lng": 9.1967508 + } + }, + { + "id": 2, + "name": "Artico Gelateria Tradizionale", + "address": "Via Dogana, 1, 20123 Milan, Italy", + "type": "ice cream", + "rating": 10, + "_geo": { + "lat": 45.4632046, + "lng": 9.1719421 + } + }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = crate::Search::new(&rtxn, &index); + + search.filter(Filter::from_str("_geoRadius(45.4777599, 9.1967508, 0)").unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + } + #[test] fn geo_radius_error() { let index = TempIndex::new(); @@ -638,6 +684,7 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); + // georadius have a bad latitude let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); From 9eeb29238819d2ce54fb0e901e69b3d59d2ef368 Mon Sep 17 00:00:00 2001 From: "bors[bot]" <26634292+bors[bot]@users.noreply.github.com> Date: Mon, 5 Dec 2022 18:26:01 +0000 Subject: [PATCH 3/4] Merge #720 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 720: Make soft deletion optional in document addition and deletion + add lots of tests r=irevoire a=loiclec # Pull Request ## What does this PR do? When debugging recent issues, I created a few unit tests in the hope of reproducing the bugs I was looking for. In the end, I didn't find any, but I thought it would still be good to keep those tests. More importantly, I added a field to the `DeleteDocuments` and `IndexDocuments` builders, called `disable_soft_deletion`.
If set to `true`, the indexing/deletion will never add documents to the `soft_deleted_documents_ids` and instead perform a real deletion of the documents from the databases. For the new tests, I have: - Improved the insta-snapshot format of the `external_documents_ids` structure - Added more tests for the facet DB indexing, deletion, and search algorithms, making sure to test them when the facet DB contains strings (instead of numbers) as well. - Added more tests for the incremental indexing of the prefix proximity databases. For example, to see if documents are replaced correctly and if common prefixes are deleted correctly. - Added tests that mix soft deletion and hard deletion, including when processing batches of document updates. Co-authored-by: Loïc Lecrenier --- milli/src/index.rs | 547 +++++++++++++++++- .../src/search/facet/facet_sort_ascending.rs | 93 ++- .../src/search/facet/facet_sort_descending.rs | 97 +++- milli/src/search/facet/mod.rs | 42 +- .../0.snap | 0 .../1.snap | 0 .../0-0.snap | 33 ++ .../0-1.snap | 33 ++ .../1-0.snap | 27 + .../1-1.snap | 27 + .../filter_sort_descending/2.snap | 60 ++ .../0-0.snap | 33 ++ .../0-1.snap | 33 ++ .../1-0.snap | 27 + .../1-1.snap | 27 + milli/src/snapshot_tests.rs | 23 +- milli/src/update/delete_documents.rs | 16 +- milli/src/update/facet/bulk.rs | 36 +- milli/src/update/facet/delete.rs | 131 ++++- milli/src/update/facet/incremental.rs | 4 +- milli/src/update/facet/mod.rs | 113 +++- .../bulk.rs/insert_string/default.hash.snap | 4 + .../large_group_small_min_level.hash.snap | 4 + .../odd_group_odd_min_level.hash.snap | 4 + .../small_group_large_min_level.hash.snap | 4 + .../small_group_small_min_level.hash.snap | 4 + .../1/facet_id_f64_docids.hash.snap | 4 - .../1/number_faceted_documents_ids.hash.snap | 4 - .../2/facet_id_f64_docids.hash.snap | 4 - .../2/number_faceted_documents_ids.hash.snap | 4 - milli/src/update/index_documents/mod.rs | 7 + .../initial/word_docids.snap | 54 ++ 
.../updated/soft_deleted_documents_ids.snap | 4 + .../updated/word_docids.snap | 58 ++ milli/src/update/prefix_word_pairs/mod.rs | 315 +++++++++- .../prefix_word_pair_proximity_docids.snap | 20 + .../word_prefix_pair_proximity_docids.snap | 0 .../prefix_word_pair_proximity_docids.snap | 0 .../update/word_pair_proximity_docids.snap | 0 .../word_prefix_pair_proximity_docids.snap | 0 .../prefix_word_pair_proximity_docids.snap | 0 .../word_pair_proximity_docids.snap | 0 .../word_prefix_pair_proximity_docids.snap | 0 .../first_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 6 + .../first_delete/word_docids.snap | 60 ++ .../word_prefix_pair_proximity_docids.snap | 11 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../initial/word_docids.snap | 65 +++ .../word_prefix_pair_proximity_docids.snap | 16 + .../reupdate/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 6 + .../reupdate/word_docids.snap | 60 ++ .../word_prefix_pair_proximity_docids.snap | 5 + .../second_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 6 + .../second_delete/word_docids.snap | 10 + .../word_prefix_pair_proximity_docids.snap | 11 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 9 + .../initial/word_docids.snap | 61 ++ .../word_prefix_pair_proximity_docids.snap | 7 + .../replaced/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 5 + .../replaced/word_docids.snap | 61 ++ .../word_prefix_pair_proximity_docids.snap | 5 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 9 + .../initial/word_docids.snap | 61 ++ .../word_prefix_pair_proximity_docids.snap | 7 + .../replaced/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 10 + .../replaced/word_docids.hash.snap | 4 + .../word_prefix_pair_proximity_docids.snap | 8 + .../first_delete/documents_ids.snap | 4 + 
.../prefix_word_pair_proximity_docids.snap | 14 + .../first_delete/word_docids.snap | 65 +++ .../word_prefix_pair_proximity_docids.snap | 16 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../initial/word_docids.snap | 65 +++ .../word_prefix_pair_proximity_docids.snap | 16 + .../reupdate/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 17 + .../reupdate/word_docids.hash.snap | 4 + .../word_prefix_pair_proximity_docids.snap | 21 + .../second_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../second_delete/word_docids.snap | 65 +++ .../word_prefix_pair_proximity_docids.snap | 16 + 91 files changed, 2724 insertions(+), 64 deletions(-) rename milli/src/search/facet/snapshots/facet_sort_ascending.rs/{filter_sort => filter_sort_ascending}/0.snap (100%) rename milli/src/search/facet/snapshots/facet_sort_ascending.rs/{filter_sort => filter_sort_ascending}/1.snap (100%) create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap create mode 100644 
milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/initial/word_prefix_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/update/prefix_word_pair_proximity_docids.snap (100%) rename 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/update/word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/update/word_prefix_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_batch_bug_3043 => batch_bug_3043}/prefix_word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_batch_bug_3043 => batch_bug_3043}/word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_batch_bug_3043 => batch_bug_3043}/word_prefix_pair_proximity_docids.snap (100%) create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap create mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap create mode 
100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap create mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/index.rs b/milli/src/index.rs index 5910a305c..9928676f0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1190,8 +1190,10 @@ pub(crate) mod tests { use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::{db_snap, Index}; + use crate::update::{ + self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, + }; + use crate::{db_snap, obkv_to_json, Index}; pub(crate) struct TempIndex { pub inner: Index, @@ -1477,4 +1479,545 @@ pub(crate) mod tests { let user_defined = 
index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); } + + #[test] + fn replace_documents_external_ids_and_soft_deletion_check() { + use big_s::S; + use maplit::hashset; + + let mut index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("doggo") }); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..4 { + docs.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + index.add_documents(documents!(docs)).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 0 + 1 1 + 2 2 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] + 1 0 3 1 [3, ] + "###); + + let mut docs = vec![]; + for i in 0..3 { + docs.push(serde_json::json!( + { "id": i, "doggo": i + 1 } + )); + } + index.add_documents(documents!(docs)).unwrap(); + + db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]"); + db_snap!(index, facet_id_f64_docids, 2, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, 4, ] + 1 0 2 1 [2, 5, ] + 1 0 3 1 [3, 6, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index + .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) + .unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, 4, ] + 1 0 2 1 [2, 5, ] + 1 0 3 1 [3, 6, ] + 1 0 4 1 
[7, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index + .update_settings(|settings| { + settings.set_distinct_field("id".to_owned()); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + + index.index_documents_config.disable_soft_deletion = true; + index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + let docid = delete.delete_external_id("3").unwrap(); + insta::assert_snapshot!(format!("{docid}"), @"7"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); + + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 3 0 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + 
"###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [0, 7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 7, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index.add_documents(documents!([{ "id": 3, "doggo": 5 }])).unwrap(); + + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 3 1 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [0, 1, 7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 7, ] + 1 0 5 1 [1, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index.add_documents(documents!([{ "id": 3, "doggo": 5, "id": 2, "doggo": 4 }])).unwrap(); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 0 4 + 1 5 + 2 2 + 3 1 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [2, 6, ] + 0 0 3 1 [0, 1, 7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 2, 7, ] + 1 0 5 1 [1, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index + .add_documents(documents!([{ "id": 4, "doggo": 5 }, { "id": 3, "doggo": 5 }])) + .unwrap(); + + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 4 3 + hard: + 0 4 + 1 5 + 2 2 + 3 1 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [2, 6, ] + 0 0 3 1 [0, 1, 7, ] + 0 0 4 1 [3, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 2, 7, ] + 1 0 5 1 [1, 3, ] + "###); + } + + #[test] + fn replace_documents_in_batches_external_ids_and_soft_deletion_check() { + use 
big_s::S; + use maplit::hashset; + + let mut index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("doggo") }); + }) + .unwrap(); + + let add_documents = |index: &TempIndex, docs: Vec<Vec<serde_json::Value>>| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new( + &mut wtxn, + index, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + for docs in docs { + (builder, _) = builder.add_documents(documents!(docs)).unwrap(); + } + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; + // First Batch + { + let mut docs1 = vec![]; + for i in 0..4 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1]); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 0 + 1 1 + 2 2 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] + 1 0 3 1 [3, ] + "###); + } + // Second Batch: replace the documents with soft-deletion + { + index.index_documents_config.disable_soft_deletion = false; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+1 } + )); + } + let mut docs2 = vec![]; + for i in 0..3 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, 4, ] + 1 0 1 1 [1, 5, ] + 1 0 2 1 [2, 6, ] + 1 0 3 1 [3, ] + "###); + } + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) =
index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0]; + + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(1), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(2), + } + "###); + drop(rtxn); + // Third Batch: replace the documents with soft-deletion again + { + index.index_documents_config.disable_soft_deletion = false; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+1 } + )); + } + let mut docs2 = vec![]; + for i in 0..4 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 7 + 1 8 + 2 9 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, 4, 7, ] + 1 0 1 1 [1, 5, 8, ] + 1 0 2 1 [2, 6, 9, ] + 1 0 3 1 [3, ] + "###); + } + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + 
insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(1), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(2), + } + "###); + drop(rtxn); + + // Fourth Batch: replace the documents without soft-deletion + { + index.index_documents_config.disable_soft_deletion = true; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+2 } + )); + } + let mut docs2 = vec![]; + for i in 0..1 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 10 + 1 11 + 2 12 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [10, ] + 1 0 3 1 [3, 11, ] + 1 0 4 1 [12, ] + "###); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0]; + let json = 
obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(4), + } + "###); + drop(rtxn); + } + } } diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 552795981..32cf5c355 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -80,6 +80,8 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { // that we found all the documents in the sub level iterations already, // we can pop this level iterator. 
if documents_ids.is_empty() { + // break out of the for loop into the end of the 'outer loop, which + // pops the stack break; } @@ -113,11 +115,14 @@ mod tests { use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; - use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids, + get_simple_index, get_simple_string_index_with_multiple_field_ids, + }; use crate::snapshot_tests::display_bitmap; #[test] - fn filter_sort() { + fn filter_sort_ascending() { let indexes = [get_simple_index(), get_random_looking_index()]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); @@ -134,4 +139,88 @@ mod tests { txn.commit().unwrap(); } } + + #[test] + fn filter_sort_ascending_multiple_field_ids() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-0")); + + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-1")); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_no_candidates() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for 
(_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_inexisting_field_id() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 12767c64d..4d1fdd1e7 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -125,12 +125,20 @@ mod tests { use crate::heed_codec::ByteSliceRefCodec; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; - use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids, + get_simple_index, 
get_simple_index_with_multiple_field_ids, + get_simple_string_index_with_multiple_field_ids, + }; use crate::snapshot_tests::display_bitmap; #[test] fn filter_sort_descending() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); @@ -147,4 +155,89 @@ mod tests { txn.commit().unwrap(); } } + + #[test] + fn filter_sort_descending_multiple_field_ids() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.content.remap_key_type::>(); + let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-0")); + + let mut results = String::new(); + + let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-1")); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_sort_ascending_with_no_candidates() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + 
for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_inexisting_field_id() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 7dfdcdb94..73054b84a 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -89,7 +89,8 @@ pub(crate) mod tests { use roaring::RoaringBitmap; use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::tests::FacetIndex; + use crate::heed_codec::StrRefCodec; + use crate::update::facet::test_helpers::FacetIndex; pub fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -147,4 +148,43 @@ pub(crate) mod tests { txn.commit().unwrap(); index } + pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + if i % 2 == 0 { + 
index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap); + } else { + index.insert(&mut txn, fid, &"", &bitmap); + } + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> { + let index = FacetIndex::<StrRefCodec>::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + if key % 2 == 0 { + index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap); + } else { + index.insert(&mut txn, fid, &"", &bitmap); + } + } + } + txn.commit().unwrap(); + index + } } diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap rename to milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap rename to milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap new file mode 100644 index 000000000..ef207f888 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] +[200, ] +[202, ] +[204, ] +[206, ] +[208, ] +[210, ] +[212, ] +[214, ] +[216, ] +[218, ] +[220, ] +[222, ] +[224, ] +[226, ] +[228, ] +[230, ] +[232, ] +[234, ] +[236, ] +[238, ] +[240, ] +[242, ] +[244, ] +[246, ] +[248, ] +[250, ] +[252, ] +[254, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap new file mode 100644 index 000000000..ef207f888 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] +[200, ] +[202, ] +[204, ] +[206, ] +[208, ] +[210, ] +[212, ] +[214, ] +[216, ] +[218, ] +[220, ] +[222, ] +[224, ] +[226, ] +[228, ] +[230, ] +[232, ] +[234, ] +[236, ] +[238, ] +[240, ] +[242, ] +[244, ] +[246, ] +[248, ] +[250, ] +[252, ] +[254, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap new file mode 100644 index 000000000..52d3d0de0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 215, 
219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] +[202, ] +[224, ] +[230, ] +[236, ] +[244, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[264, ] +[278, ] +[282, ] +[286, ] +[292, ] +[206, ] +[208, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap new file mode 100644 index 000000000..52d3d0de0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] +[202, ] +[224, ] +[230, ] +[236, ] +[244, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[264, ] +[278, ] +[282, ] +[286, ] +[292, ] +[206, ] +[208, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap new file mode 100644 index 000000000..032763c74 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] +[247, ] +[246, ] +[245, ] +[244, ] +[243, ] +[242, ] +[241, ] +[240, ] +[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] +[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, 
] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap new file mode 100644 index 000000000..b833cae97 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[254, ] +[252, ] +[250, ] +[248, ] +[246, ] +[244, ] +[242, ] +[240, ] +[238, ] +[236, ] +[234, ] +[232, ] +[230, ] +[228, ] +[226, ] +[224, ] +[222, ] +[220, ] +[218, ] +[216, ] +[214, ] +[212, ] +[210, ] +[208, ] +[206, ] +[204, ] +[202, ] +[200, ] +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap new file mode 100644 index 000000000..b833cae97 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[254, ] +[252, ] +[250, ] +[248, ] +[246, ] +[244, ] +[242, ] +[240, ] +[238, ] +[236, ] +[234, ] +[232, ] +[230, ] +[228, ] +[226, ] +[224, ] +[222, ] +[220, ] +[218, ] +[216, ] +[214, ] +[212, ] +[210, ] +[208, ] +[206, ] +[204, ] +[202, ] +[200, ] +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap 
b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap new file mode 100644 index 000000000..2623a8807 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[238, ] +[236, ] +[226, ] +[220, ] +[216, ] +[210, ] +[208, ] +[206, ] +[292, ] +[286, ] +[282, ] +[278, ] +[264, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[244, ] +[230, ] +[224, ] +[202, ] +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap new file mode 100644 index 000000000..2623a8807 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[238, ] +[236, ] +[226, ] +[220, ] +[216, ] +[210, ] +[208, ] +[206, ] +[292, ] +[286, ] +[282, ] +[278, ] +[264, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[244, ] +[230, ] +[224, ] +[202, ] +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] + diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 46972deba..9ad5fe425 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -356,19 +356,22 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let ExternalDocumentsIds { soft, hard, .. 
} = index.external_documents_ids(&rtxn).unwrap(); + let mut snap = String::new(); - let soft_bytes = soft.into_fst().as_bytes().to_owned(); - let mut hex_soft = String::new(); - for byte in soft_bytes { - write!(&mut hex_soft, "{:x}", byte).unwrap(); + + writeln!(&mut snap, "soft:").unwrap(); + let stream_soft = soft.stream(); + let soft_external_ids = stream_soft.into_str_vec().unwrap(); + for (key, id) in soft_external_ids { + writeln!(&mut snap, "{key:<24} {id}").unwrap(); } - writeln!(&mut snap, "soft: {hex_soft}").unwrap(); - let hard_bytes = hard.into_fst().as_bytes().to_owned(); - let mut hex_hard = String::new(); - for byte in hard_bytes { - write!(&mut hex_hard, "{:x}", byte).unwrap(); + writeln!(&mut snap, "hard:").unwrap(); + let stream_hard = hard.stream(); + let hard_external_ids = stream_hard.into_str_vec().unwrap(); + for (key, id) in hard_external_ids { + writeln!(&mut snap, "{key:<24} {id}").unwrap(); } - writeln!(&mut snap, "hard: {hex_hard}").unwrap(); + snap } pub fn snap_number_faceted_documents_ids(index: &Index) -> String { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index a6a4ea609..88ec78420 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -26,7 +26,6 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, - #[cfg(test)] disable_soft_deletion: bool, } @@ -48,12 +47,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, to_delete_docids: RoaringBitmap::new(), - #[cfg(test)] disable_soft_deletion: false, }) } - #[cfg(test)] pub fn disable_soft_deletion(&mut self, disable: bool) { self.disable_soft_deletion = disable; } @@ -156,17 +153,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. 
- let disable_soft_deletion = { - #[cfg(not(test))] - { - false - } - #[cfg(test)] - { - self.disable_soft_deletion - } - }; - if !disable_soft_deletion + + if !self.disable_soft_deletion && percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index b1065c0bc..30f15ebab 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -365,8 +365,9 @@ mod tests { use crate::documents::documents_batch_reader_from_objects; use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; use crate::index::tests::TempIndex; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; use crate::{db_snap, milli_snap}; #[test] @@ -491,4 +492,37 @@ mod tests { db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); } + + #[test] + fn insert_string() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + + let strings = (0..1_000).map(|i| ordered_string(i as usize)).collect::>(); + let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, &strings[i as usize]), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, &strings[i as usize]), once(i).collect())); + } + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + 
test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } } diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 9bec2d911..4030f10da 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -114,11 +114,14 @@ mod tests { use big_s::S; use maplit::hashset; + use rand::seq::SliceRandom; + use rand::SeedableRng; use roaring::RoaringBitmap; use crate::db_snap; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::update::facet::test_helpers::ordered_string; use crate::update::DeleteDocuments; #[test] @@ -156,8 +159,8 @@ mod tests { let documents = documents_batch_reader_from_objects(documents); index.add_documents(documents).unwrap(); - db_snap!(index, facet_id_f64_docids, 1); - db_snap!(index, number_faceted_documents_ids, 1); + db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); + db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); let mut wtxn = index.env.write_txn().unwrap(); @@ -174,8 +177,126 @@ mod tests { wtxn.commit().unwrap(); db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_f64_docids, 2); - db_snap!(index, number_faceted_documents_ids, 2); + db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); + db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); + } + + // Same test as above but working with string values for the facets + #[test] + fn delete_mixed_incremental_and_bulk_string() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. 
Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": ordered_string(i / 10), + "colour": ordered_string(i / 100), + "timestamp": ordered_string(i / 2), + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) + db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); + db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, 
facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); + db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); + } + + #[test] + fn delete_almost_all_incrementally_string() { + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": ordered_string(i / 10), + "colour": ordered_string(i / 100), + "timestamp": ordered_string(i / 2), + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) + db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); + db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + + let mut docids_to_delete = (0..1000).collect::>(); + docids_to_delete.shuffle(&mut rng); + for docid in docids_to_delete.into_iter().take(990) { + let mut wtxn = index.env.write_txn().unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter([docid])); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + } + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); + db_snap!(index, string_faceted_documents_ids, 2, @r###" + 0 [] + 1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] + 2 [292, 324, 358, 381, 493, 839, 852, ] + 3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] + "###); } } @@ -188,7 +309,7 
@@ mod comparison_bench { use roaring::RoaringBitmap; use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::FacetIndex; // This is a simple test to get an intuition on the relative speed // of the incremental vs. bulk indexer. diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 223d4fc63..cffce5525 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -648,7 +648,7 @@ mod tests { use crate::heed_codec::facet::OrderedF64Codec; use crate::heed_codec::StrRefCodec; use crate::milli_snap; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::FacetIndex; #[test] fn append() { @@ -1053,7 +1053,7 @@ mod fuzz { use tempfile::TempDir; use super::*; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::FacetIndex; #[derive(Default)] pub struct TrivialDatabase { pub elements: BTreeMap>, diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 76e5514a1..fd55204c3 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -162,7 +162,7 @@ impl<'i> FacetsUpdate<'i> { } #[cfg(test)] -pub(crate) mod tests { +pub(crate) mod test_helpers { use std::cell::Cell; use std::fmt::Display; use std::iter::FromIterator; @@ -183,6 +183,23 @@ pub(crate) mod tests { use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; + /// Utility function to generate a string whose position in a lexicographically + /// ordered list is `i`. 
+ pub fn ordered_string(mut i: usize) -> String { + // The first string is empty + if i == 0 { + return String::new(); + } + // The others are 5 char long, each between 'a' and 'z' + let mut s = String::new(); + for _ in 0..5 { + let (digit, next) = (i % 26, i / 26); + s.insert(0, char::from_u32('a' as u32 + digit as u32).unwrap()); + i = next; + } + s + } + /// A dummy index that only contains the facet database, used for testing pub struct FacetIndex where @@ -438,6 +455,98 @@ pub(crate) mod tests { } } +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + + #[test] + fn replace_all_identical_soft_deletion_then_hard_deletion() { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("size") }); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "size": i % 250, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); + db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); + db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); + + let mut documents = vec![]; + for i in 0..999 { + documents.push( + serde_json::json! 
{ + { + "id": i, + "size": i % 250, + "other": 0, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); + db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); + db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); + + // Then replace the last document while disabling soft_deletion + index.index_documents_config.disable_soft_deletion = true; + let mut documents = vec![]; + for i in 999..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "size": i % 250, + "other": 0, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); + db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); + db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); + } +} + #[allow(unused)] #[cfg(test)] mod comparison_bench { @@ -446,7 +555,7 @@ mod comparison_bench { use rand::Rng; use roaring::RoaringBitmap; - use super::tests::FacetIndex; + use super::test_helpers::FacetIndex; use crate::heed_codec::facet::OrderedF64Codec; // This is a simple test to get an intuition on the relative speed diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap new file mode 100644 index 000000000..b7705b72e --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +353d70f52eea66e5031dca989ea8a037 diff --git 
a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..15030a1ea --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +52a093c909133d84023a4a7b83864808 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..949ec6647 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9d86c72ddb241d0aeca2995d61a3648a diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..d8797f1ab --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +c0943177594534bfe5527cbf40fe388e diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..f7949c5f3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +6ed86f234028ae3df5881bee5512f11e diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap 
b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap deleted file mode 100644 index fee486bab..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -550cd138d6fe31ccdd42cd5392fbd576 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap deleted file mode 100644 index fcf957004..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -9a0ea88e7c9dcf6dc0ef0b601736ffcf diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap deleted file mode 100644 index 29ceb250e..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -d4d5f14e7f1e1f09b86821a0b6defcc6 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap deleted file mode 100644 index bbaf6d2a2..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -3570e0ac0fdb21be9ebe433f59264b56 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index af99a230b..db6ffedc1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -88,6 +88,7 @@ pub struct IndexDocumentsConfig { pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, + pub disable_soft_deletion: bool, pub autogenerate_docids: bool, } @@ -331,6 +332,7 @@ where // able to simply insert all the documents even if they already exist in the database. if !replaced_documents_ids.is_empty() { let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; + deletion_builder.disable_soft_deletion(self.config.disable_soft_deletion); debug!("documents to delete {:?}", replaced_documents_ids); deletion_builder.delete_documents(&replaced_documents_ids); let deleted_documents_count = deletion_builder.execute()?; @@ -906,6 +908,8 @@ mod tests { { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } ])).unwrap(); + db_snap!(index, word_docids, "initial"); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -928,6 +932,9 @@ mod tests { let count = index.all_documents(&rtxn).unwrap().count(); assert_eq!(count, 6); + db_snap!(index, word_docids, "updated"); + db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); + drop(rtxn); } diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap new file mode 100644 index 000000000..5b424356a --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [2, ] +10.0 [1, ] +12 [0, ] 
+1344 [3, ] +2 [0, ] +23 [5, ] +25.99 [2, ] +3.5 [0, ] +35 [5, ] +4 [4, ] +42 [0, 5, ] +456 [1, ] +adams [5, ] +adventure [1, ] +alice [2, ] +and [0, 4, ] +antoine [1, ] +austin [0, ] +blood [4, ] +carroll [2, ] +de [1, ] +douglas [5, ] +exupery [1, ] +fantasy [2, 3, 4, ] +galaxy [5, ] +guide [5, ] +half [4, ] +harry [4, ] +hitchhiker' [5, ] +hobbit [3, ] +in [2, ] +j [3, 4, ] +jane [0, ] +k [4, ] +le [1, ] +lewis [2, ] +petit [1, ] +potter [4, ] +prejudice [0, ] +pride [0, ] +prince [1, 4, ] +r [3, ] +romance [0, ] +rowling [4, ] +s [5, ] +saint [1, ] +the [3, 4, 5, ] +to [5, ] +tolkien [3, ] +wonderland [2, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..9228ad265 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[0, 1, 4, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap new file mode 100644 index 000000000..4f4a9e33a --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap @@ -0,0 +1,58 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [2, ] +10.0 [1, 7, ] +12 [0, 8, ] +1344 [3, ] +1813 [8, ] +2 [0, 8, ] +23 [5, ] +25.99 [2, ] +3.5 [0, 8, ] +35 [5, ] +4 [4, 6, ] +42 [0, 5, 8, ] +456 [1, 7, ] +adams [5, ] +adventure [1, 7, ] +alice [2, ] +and [0, 4, 6, 8, ] +antoine [1, 7, ] +austen [8, ] +austin [0, ] +blood [4, 6, ] +carroll [2, ] +de [1, 7, ] +douglas [5, ] +exupery [1, 7, ] +fantasy [2, 3, 4, 6, ] +galaxy [5, ] +guide [5, ] 
+half [4, 6, ] +harry [4, 6, ] +hitchhiker' [5, ] +hobbit [3, ] +in [2, ] +j [3, 4, 6, 8, ] +jane [0, ] +k [4, 6, ] +le [1, ] +lewis [2, ] +little [7, ] +petit [1, ] +potter [4, 6, ] +prejudice [0, 8, ] +pride [0, 8, ] +prince [1, 4, 7, ] +princess [6, ] +r [3, ] +romance [0, 8, ] +rowling [4, 6, ] +s [5, ] +saint [1, 7, ] +the [3, 4, 5, 6, 7, ] +to [5, ] +tolkien [3, ] +wonderland [2, ] + diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 10ea850af..01a4de35e 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -156,30 +156,40 @@ pub fn write_into_lmdb_database_without_merging( #[cfg(test)] mod tests { use std::io::Cursor; + use std::iter::FromIterator; + + use roaring::RoaringBitmap; use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; + use crate::update::{DeleteDocuments, IndexDocumentsMethod}; - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + fn documents_with_enough_different_words_for_prefixes( + prefixes: &[&str], + start_id: usize, + ) -> Vec { let mut documents = Vec::new(); + let mut id = start_id; for prefix in prefixes { for i in 0..50 { documents.push( serde_json::json!({ + "id": id, "text": format!("{prefix}{i:x}"), }) .as_object() .unwrap() .clone(), - ) + ); + id += 1; } } documents } #[test] - fn test_update() { + fn add_new_documents() { let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); index.index_documents_config.autogenerate_docids = true; @@ -198,10 +208,11 @@ mod tests { DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() }; - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); + let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); // now we add some documents where the text 
should populate the word_prefix_pair_proximity_docids database documents.push( serde_json::json!({ + "id": "9000", "text": "At an amazing and beautiful house" }) .as_object() @@ -210,6 +221,7 @@ mod tests { ); documents.push( serde_json::json!({ + "id": "9001", "text": "The bell rings at 5 am" }) .as_object() @@ -221,10 +233,12 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); + let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); documents.push( serde_json::json!({ + "id": "9002", "text": "At an extraordinary house" }) .as_object() @@ -239,7 +253,7 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "update"); } #[test] - fn test_batch_bug_3043() { + fn batch_bug_3043() { // https://github.com/meilisearch/meilisearch/issues/3043 let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); @@ -259,7 +273,7 @@ mod tests { DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() }; - let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]); + let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database documents.push( serde_json::json!({ @@ -285,4 +299,291 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids); db_snap!(index, prefix_word_pair_proximity_docids); } + + #[test] + fn hard_delete_and_reupdate() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + 
+ let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.disable_soft_deletion(true); + delete.delete_documents(&RoaringBitmap::from_iter([50])); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "first_delete"); + db_snap!(index, word_docids, "first_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.disable_soft_deletion(true); + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "second_delete"); + db_snap!(index, word_docids, "second_delete"); + db_snap!(index, 
word_prefix_pair_proximity_docids, "second_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "reupdate"); + db_snap!(index, word_docids, "reupdate"); + db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); + db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); + } + + #[test] + fn soft_delete_and_reupdate() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, 
prefix_word_pair_proximity_docids, "initial"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_documents(&RoaringBitmap::from_iter([50])); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "first_delete"); + db_snap!(index, word_docids, "first_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "second_delete"); + db_snap!(index, word_docids, "second_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "reupdate"); + db_snap!(index, word_docids, "reupdate"); + db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); + db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); + } + + #[test] + fn replace_soft_deletion() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); 
+ for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "replaced"); + db_snap!(index, word_docids, "replaced"); + db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); + db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); + db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]"); + } + + #[test] + fn replace_hard_deletion() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.disable_soft_deletion = true; + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + 
settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "replaced"); + db_snap!(index, word_docids, "replaced"); + db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); + db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); + db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]"); + } } diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..6609786a3 --- /dev/null +++ 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [101, ] +1 a amazing [100, ] +1 a an [100, ] +1 a and [100, ] +1 a beautiful [100, ] +1 b house [100, ] +1 b rings [101, ] +1 be house [100, ] +1 be rings [101, ] +2 a am [101, ] +2 a amazing [100, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a house [100, ] +2 b at [101, ] +2 be at [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap rename to 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap rename to 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..61987fd4a --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +2 a am [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap new file mode 100644 index 000000000..1caf1a9a3 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, 
] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +at [51, ] +bell [51, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..41c71ea59 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,11 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 rings a [51, ] +2 at a [51, ] +2 bell a [51, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..267a1c01d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 b rings [51, ] +2 b at [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap new file mode 100644 index 000000000..e5336d58c --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +am [51, ] +at [51, ] +b0 [0, ] +b1 [1, ] +b10 [16, ] +b11 [17, ] +b12 [18, ] +b13 [19, ] +b14 [20, ] +b15 [21, ] +b16 [22, ] +b17 [23, ] +b18 [24, ] +b19 [25, ] +b1a [26, ] +b1b [27, ] +b1c [28, ] +b1d [29, ] +b1e [30, ] +b1f [31, ] +b2 [2, ] +b20 [32, ] +b21 [33, ] +b22 [34, ] +b23 [35, ] +b24 [36, ] +b25 [37, ] +b26 [38, ] +b27 [39, ] +b28 [40, ] +b29 [41, ] +b2a [42, ] +b2b [43, ] +b2c [44, ] +b2d [45, ] +b2e [46, ] +b2f [47, ] +b3 [3, ] +b30 [48, ] +b31 [49, ] +b4 [4, ] +b5 [5, ] +b6 [6, ] +b7 [7, ] +b8 [8, ] +b9 [9, ] +ba [10, ] +bb [11, ] +bc [12, ] +bd [13, ] +be [14, ] +bell [51, ] +bf [15, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..4cdf756ac --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 the b [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap new file mode 100644 index 000000000..4dca775e6 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/update/prefix_word_pairs/mod.rs +--- +[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..61987fd4a --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +2 a am [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap new file mode 100644 index 000000000..7949d464e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap @@ -0,0 +1,10 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +am [51, ] +at [51, ] +bell [51, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..41c71ea59 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,11 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 rings a [51, ] +2 at a [51, ] +2 bell a [51, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..78b6a3885 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,9 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap new file mode 100644 index 000000000..8c7809973 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c 
[44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +amazing [50, ] +an [50, ] +at [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..65d8b806b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap new file mode 100644 index 000000000..775d41a3d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..54c9e4b9b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap @@ 
-0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 b rings [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap new file mode 100644 index 000000000..f86fdcb8b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +amazing [50, ] +an [50, ] +at [50, ] +b0 [52, ] +b1 [53, ] +b10 [68, ] +b11 [69, ] +b12 [70, ] +b13 [71, ] +b14 [72, ] +b15 [73, ] +b16 [74, ] +b17 [75, ] +b18 [76, ] +b19 [77, ] +b1a [78, ] +b1b [79, ] +b1c [80, ] +b1d [81, ] +b1e [82, ] +b1f [83, ] +b2 [54, ] +b20 [84, ] +b21 [85, ] +b22 [86, ] +b23 [87, ] +b24 [88, ] +b25 [89, ] +b26 [90, ] +b27 [91, ] +b28 [92, ] +b29 [93, ] +b2a [94, ] +b2b [95, ] +b2c [96, ] +b2d [97, ] +b2e [98, ] +b2f [99, ] +b3 [55, ] +b30 [100, ] +b31 [101, ] +b4 [56, ] +b5 [57, ] +b6 [58, ] +b7 [59, ] +b8 [60, ] +b9 [61, ] +ba [62, ] +bb [63, ] +bc [64, ] +bd [65, ] +be [66, ] +bell [51, ] +bf [67, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..4cdf756ac --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 the b [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap new 
file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..78b6a3885 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,9 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap new file mode 100644 index 000000000..8c7809973 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 
[7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +amazing [50, ] +an [50, ] +at [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..65d8b806b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap new file mode 100644 index 000000000..775d41a3d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..0241f26a5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,10 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 
a house [50, ] +1 b rings [51, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap new file mode 100644 index 000000000..6a481eeee --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5f6443e54fae188aa96d4f27fce28939 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..d20582970 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,8 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +1 the b [51, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff 
--git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap new file mode 100644 index 000000000..c8a1e54b4 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..db62b6566 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,17 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +1 b house [50, ] +1 b rings [51, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] +2 b at [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap new file mode 100644 index 000000000..7fd726325 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +9f4866b80177e321a33ce434992022b5 diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..8a684b16d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,21 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 and b [50, ] +1 at a [50, ] +1 rings a [51, ] +1 the b [51, ] +2 amazing b [50, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 an b [50, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 at b [50, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap new file mode 100644 index 000000000..4dca775e6 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a 
beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + 
From 8782c1785c28fd667ac5a6bf4ac57577c4497e52 Mon Sep 17 00:00:00 2001 From: "bors[bot]" <26634292+bors[bot]@users.noreply.github.com> Date: Tue, 6 Dec 2022 14:37:38 +0000 Subject: [PATCH 4/4] Merge #723 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 723: Fix bug in handling of soft deleted documents when updating settings r=Kerollmops a=loiclec # Pull Request ## Related issue Fixes (partially, until merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/3021 ## What does this PR do? This PR fixes the bug where a `missing key in documents database` internal error message could appear when indexing documents. When updating the settings, before clearing the database and before creating the transform output, we now modify the `ExternalDocumentsIds` structure to get rid of all references to soft deleted document ids in its FSTs. It used to be that updating the settings would clear the soft-deleted document ids, but keep the original `ExternalDocumentsIds` structure. As a consequence of this, when processing a future document addition, we could wrongly believe that a document was being replaced when, in fact, it was a completely new document. See the tests `bug_3021_first`, `bug_3021_second`, and `bug_3021` for a minimal test case that would have reproduced the issue. We need to take special care to: - evaluate how users should update to v0.30.1 (containing this fix): dump? reimporting all documents from scratch? 
- understand IF/HOW this bug could have caused duplicate documents to be returned - and evaluate the correctness of the fix, of course :) Co-authored-by: Loïc Lecrenier --- milli/src/external_documents_ids.rs | 24 + milli/src/index.rs | 417 +++++++++++------- milli/src/update/index_documents/transform.rs | 34 +- milli/src/update/settings.rs | 19 +- 4 files changed, 311 insertions(+), 183 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 6029722af..64b294541 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -71,6 +71,30 @@ impl<'a> ExternalDocumentsIds<'a> { self.merge_soft_into_hard() } + /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they + /// don't contain any soft deleted document id. + pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { + let mut new_hard_builder = fst::MapBuilder::memory(); + + let union_op = self.hard.op().add(&self.soft).r#union(); + let mut iter = union_op.into_stream(); + while let Some((external_id, docids)) = iter.next() { + // prefer selecting the ids from soft, always + let id = indexed_last_value(docids).unwrap(); + if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { + new_hard_builder.insert(external_id, id)?; + } + } + drop(iter); + + // Delete soft map completely + self.soft = fst::Map::default().map_data(Cow::Owned)?; + // We save the new map as the new hard map. 
+ self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + + Ok(()) + } + pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { let union_op = self.soft.op().add(other).r#union(); diff --git a/milli/src/index.rs b/milli/src/index.rs index 9928676f0..6610bfd9e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1185,13 +1185,15 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; + use maplit::hashset; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, + self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Index}; @@ -1485,7 +1487,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); + let index = TempIndex::new(); index .update_settings(|settings| { @@ -1544,7 +1546,6 @@ pub(crate) mod tests { 1 0 3 1 [3, 6, ] "###); - index.index_documents_config.disable_soft_deletion = false; index .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) .unwrap(); @@ -1568,7 +1569,6 @@ pub(crate) mod tests { 1 0 4 1 [7, ] "###); - index.index_documents_config.disable_soft_deletion = false; index .update_settings(|settings| { settings.set_distinct_field("id".to_owned()); @@ -1576,37 +1576,13 @@ pub(crate) mod tests { .unwrap(); db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); - db_snap!(index, external_documents_ids, 3, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); - db_snap!(index, facet_id_f64_docids, 3, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 
1 0 4 1 [7, ] - "###); - - index.index_documents_config.disable_soft_deletion = true; - index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); db_snap!(index, external_documents_ids, 3, @r###" soft: - 3 7 hard: 0 4 1 5 2 6 - 3 3 + 3 7 "###); db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); db_snap!(index, facet_id_f64_docids, 3, @r###" @@ -1619,140 +1595,6 @@ pub(crate) mod tests { 1 0 3 1 [6, ] 1 0 4 1 [7, ] "###); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - let docid = delete.delete_external_id("3").unwrap(); - insta::assert_snapshot!(format!("{docid}"), @"7"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, @"[4, 5, 6, ]"); - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 0 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [0, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 7, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 5 }])).unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 1 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 
0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [0, 1, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 7, ] - 1 0 5 1 [1, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 5, "id": 2, "doggo": 4 }])).unwrap(); - db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: - 0 4 - 1 5 - 2 2 - 3 1 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [2, 6, ] - 0 0 3 1 [0, 1, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 2, 7, ] - 1 0 5 1 [1, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index - .add_documents(documents!([{ "id": 4, "doggo": 5 }, { "id": 3, "doggo": 5 }])) - .unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 4 3 - hard: - 0 4 - 1 5 - 2 2 - 3 1 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [2, 6, ] - 0 0 3 1 [0, 1, 7, ] - 0 0 4 1 [3, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 2, 7, ] - 1 0 5 1 [1, 3, ] - "###); } #[test] @@ -2020,4 +1862,253 @@ pub(crate) mod tests { drop(rtxn); } } + + #[test] + fn bug_3021_first() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 38 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = 
index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("34"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that this document addition works correctly now. + // It would be wrongly interpreted as a replacement before + index.add_documents(documents!({ "primary_key": 34 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // We do the test again, but deleting the document with id 0 instead of id 1 now + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("38"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec!["primary_key".to_owned()]); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + + // And adding lots 
of documents afterwards instead of just one. + // These extra subtests don't add much, but it's better than nothing. + index.add_documents(documents!([{ "primary_key": 38 }, { "primary_key": 39 }, { "primary_key": 41 }, { "primary_key": 40 }, { "primary_key": 41 }, { "primary_key": 42 }])).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 7, @r###" + soft: + hard: + 34 1 + 38 0 + 39 2 + 40 4 + 41 3 + 42 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); + } + + #[test] + fn bug_3021_second() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 30 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("34"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 30 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, 
@"[]"); + + // So that when we add a new document + index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + // The external documents ids don't have several external ids pointing to the same + // internal document id + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 30 0 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // And when we add 34 again, we don't replace document 35 + index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); + + // And document 35 still exists, is not deleted + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(30), + } + "###); + + // Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34 + let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(34), + "a": Number(1), + } + "###); + + drop(rtxn); + + // Add new documents again + index + .add_documents( + documents!([{ "primary_key": 37 }, { "primary_key": 38 }, { "primary_key": 39 }]), + ) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + 37 3 + 38 4 + 39 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + } } diff --git a/milli/src/update/index_documents/transform.rs 
b/milli/src/update/index_documents/transform.rs index 57aa02e04..f414569b9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -546,12 +546,13 @@ impl<'a, 'i> Transform<'a, 'i> { }) } - /// Returns a `TransformOutput` with a file that contains the documents of the index - /// with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents + /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// // TODO this can be done in parallel by using the rayon `ThreadPool`. 
- pub fn remap_index_documents( + pub fn prepare_for_documents_reindexing( self, - wtxn: &mut heed::RwTxn, + wtxn: &mut heed::RwTxn<'i, '_>, old_fields_ids_map: FieldsIdsMap, mut new_fields_ids_map: FieldsIdsMap, ) -> Result<TransformOutput> { @@ -559,7 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> { let primary_key = self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); let field_distribution = self.index.field_distribution(wtxn)?; - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + + // Delete the soft deleted document ids from the maps inside the external_document_ids structure + let new_external_documents_ids = { + let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; + external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + external_documents_ids + }; + let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; @@ -638,17 +646,25 @@ impl<'a, 'i> Transform<'a, 'i> { let mut flattened_documents = flattened_writer.into_inner()?; flattened_documents.seek(SeekFrom::Start(0))?; - Ok(TransformOutput { + let output = TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - external_documents_ids: external_documents_ids.into_static(), + external_documents_ids: new_external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count, original_documents, flattened_documents, - }) + }; + + let new_facets = output.compute_real_facets(wtxn, self.index)?; + self.index.put_faceted_fields(wtxn, &new_facets)?; + + // We clear the full database (words-fst, documents ids and documents content). 
+ ClearDocuments::new(wtxn, self.index).execute()?; + + Ok(output) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6da32d73f..8fcdafffe 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,7 +12,7 @@ use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::index_documents::IndexDocumentsMethod; -use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] @@ -291,15 +291,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { false, )?; - // We remap the documents fields based on the new `FieldsIdsMap`. - let output = - transform.remap_index_documents(self.wtxn, old_fields_ids_map, fields_ids_map)?; - - let new_facets = output.compute_real_facets(self.wtxn, self.index)?; - self.index.put_faceted_fields(self.wtxn, &new_facets)?; - - // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(self.wtxn, self.index).execute()?; + // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. + let output = transform.prepare_for_documents_reindexing( + self.wtxn, + old_fields_ids_map, + fields_ids_map, + )?; // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order. @@ -719,7 +716,7 @@ mod tests { use super::*; use crate::error::Error; use crate::index::tests::TempIndex; - use crate::update::DeleteDocuments; + use crate::update::{ClearDocuments, DeleteDocuments}; use crate::{Criterion, Filter, SearchResult}; #[test]