From 5e862e601db8f4603321143dad25e40588aa3008 Mon Sep 17 00:00:00 2001
From: JinyongHa
Date: Mon, 21 Feb 2022 02:09:17 +0000
Subject: [PATCH] dedup-tool: add sampling mode to crawling

The shallow mode aims at runtime deduplication with lower overhead. It
tries deduplication on a few sampled objects in the base pool, and only
deduplicates objects or chunks which are highly duplicated among the
samples. The deep mode crawls all objects and deduplicates them. It is
suitable for background deduplication, which aims to find all duplication
even at the cost of higher overhead.
The option "sampling-ratio" controls the ratio of objects to be picked.

---
 src/tools/ceph_dedup_tool.cc | 76 +++++++++++++++++++++++++++++++-----
 1 file changed, 67 insertions(+), 9 deletions(-)

diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
index 8ac95adb915ebc..f0c2713a02ad6d 100644
--- a/src/tools/ceph_dedup_tool.cc
+++ b/src/tools/ceph_dedup_tool.cc
@@ -158,6 +158,8 @@ void usage()
   cout << " --max-read-size " << std::endl;
   cout << " --object-dedup-threshold " << std::endl;
   cout << " --chunk-dedup-threshold " << std::endl;
+  cout << " --sampling-ratio " << std::endl;
+  cout << " --shallow-crawling" << std::endl;
   cout << "explanations: " << std::endl;
   cout << " chunk-dedup performs deduplication using a chunk generated by given source" << std::endl;
   cout << " offset and length. object-dedup deduplicates the entire object, not a chunk" << std::endl;
@@ -562,6 +564,11 @@ void ChunkScrub::chunk_scrub_common()
 class SampleDedup : public CrawlerThread
 {
 public:
+  enum class crawl_mode_t {
+    DEEP,
+    SHALLOW,
+  };
+
   SampleDedup(
     IoCtx& io_ctx,
     IoCtx& chunk_io_ctx,
@@ -571,12 +578,16 @@ class SampleDedup : public CrawlerThread
     ObjectCursor end,
     int32_t report_period,
     uint64_t num_objects,
+    uint32_t sampling_ratio,
     uint32_t object_dedup_threshold,
     uint32_t chunk_dedup_threshold,
     size_t chunk_size,
-    std::string& fp_algo) :
+    std::string& fp_algo,
+    crawl_mode_t mode) :
     CrawlerThread(io_ctx, n, m, begin, end, report_period, num_objects),
     chunk_io_ctx(chunk_io_ctx),
+    mode(mode),
+    sampling_ratio(sampling_ratio),
     chunk_dedup_threshold(chunk_dedup_threshold),
     object_dedup_threshold(object_dedup_threshold),
     chunk_size(chunk_size),
@@ -638,6 +649,8 @@ class SampleDedup : public CrawlerThread
   Rados rados;
   IoCtx chunk_io_ctx;
+  crawl_mode_t mode;
+  uint32_t sampling_ratio;
   std::list duplicable_chunks;
   size_t total_duplicated_size = 0;
   size_t total_object_size = 0;
@@ -707,6 +720,7 @@ class SampleDedup : public CrawlerThread
 };
 
 SampleDedup::FpStore SampleDedup::fp_store;
+SampleDedup::crawl_mode_t default_crawl_mode = SampleDedup::crawl_mode_t::DEEP;
 std::unordered_set SampleDedup::flushed_objects;
 std::shared_mutex SampleDedup::flushed_lock;
 
@@ -738,9 +752,9 @@ void SampleDedup::crawl()
     shard_end,
     100);
 
-  // Pick few objects to be processed. Crawling mode decides how many objects
-  // to pick (sampling ratio). Lower sampling ratio makes crawler to has lower
-  // crawling overhead but finds less duplication.
+  // Pick a few objects to be processed. The crawling mode decides how many
+  // objects to pick (sampling ratio). A lower sampling ratio gives the crawler
+  // lower crawling overhead but finds less duplication.
   std::set sampled_indexes = sample_object(objects.size());
   for (size_t index : sampled_indexes) {
     ObjectItem target = objects[index];
@@ -846,8 +860,31 @@ std::tuple, ObjectCursor> SampleDedup::get_objects(
 std::set SampleDedup::sample_object(size_t count)
 {
   std::set indexes;
-  for (size_t index = 0 ; index < count ; index++) {
-    indexes.insert(index);
-  }
+  switch(mode) {
+    // DEEP mode crawls all objects. Crawling has high overhead but finds
+    // all duplication.
+    case crawl_mode_t::DEEP: {
+      for (size_t index = 0 ; index < count ; index++) {
+        indexes.insert(index);
+      }
+      break;
+    }
+
+    // SHALLOW mode crawls only a few objects. Crawling has low overhead but
+    // finds less duplication.
+    case crawl_mode_t::SHALLOW: {
+      size_t sampling_count = count * sampling_ratio / 100;
+      default_random_engine generator;
+      uniform_int_distribution distribution(0, count - 1);
+      while (indexes.size() < sampling_count) {
+        size_t index = distribution(generator);
+        indexes.insert(index);
+      }
+      break;
+    }
+
+    default:
+      assert(false);
+  }
   return indexes;
 }
@@ -1755,6 +1792,19 @@ int make_crawling_daemon(const map &opts,
     }
   }
 
+  SampleDedup::crawl_mode_t crawl_mode = default_crawl_mode;
+  i = opts.find("shallow-crawling");
+  if (i != opts.end()) {
+    crawl_mode = SampleDedup::crawl_mode_t::SHALLOW;
+  }
+  uint32_t sampling_ratio = 50;
+  i = opts.find("sampling-ratio");
+  if (i != opts.end()) {
+    if (rados_sistrtoll(i, &sampling_ratio)) {
+      return -EINVAL;
+    }
+  }
+
   uint32_t object_dedup_threshold = 50;
   i = opts.find("object-dedup-threshold");
   if (i != opts.end()) {
@@ -1842,10 +1892,12 @@ int make_crawling_daemon(const map &opts,
     cerr << " operate fail : " << cpp_strerror(ret) << std::endl;
     return ret;
   }
-
-  cout << "Object Dedup Threshold : " << object_dedup_threshold << std::endl
+
+  cout << "Sampling Ratio : " << sampling_ratio << std::endl
+    << "Object Dedup Threshold : " << object_dedup_threshold << std::endl
     << "Chunk Dedup Threshold : " << chunk_dedup_threshold << std::endl
     << "Chunk Size : " << chunk_size << std::endl
+    << "Mode : " << ((crawl_mode == SampleDedup::crawl_mode_t::DEEP) ? "DEEP" : "SHALLOW")
     << std::endl;
 
   while (true) {
@@ -1884,10 +1936,12 @@ int make_crawling_daemon(const map &opts,
         end,
         report_period,
         s.num_objects,
+        sampling_ratio,
         object_dedup_threshold,
         chunk_dedup_threshold,
         chunk_size,
-        fp_algo));
+        fp_algo,
+        crawl_mode));
       ptr->set_debug(debug);
       ptr->create("sample_dedup");
       estimate_threads.push_back(move(ptr));
@@ -1968,10 +2022,14 @@ int main(int argc, const char **argv)
       opts["source-length"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--dedup-cdc-chunk-size", (char*)NULL)) {
       opts["dedup-cdc-chunk-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--sampling-ratio", (char*)NULL)) {
+      opts["sampling-ratio"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--object-dedup-threshold", (char*)NULL)) {
       opts["object-dedup-threshold"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--chunk-dedup-threshold", (char*)NULL)) {
       opts["chunk-dedup-threshold"] = val;
+    } else if (ceph_argparse_flag(args, i, "--shallow-crawling", (char*)NULL)) {
+      opts["shallow-crawling"] = "true";
     } else if (ceph_argparse_flag(args, i, "--debug", (char*)NULL)) {
       opts["debug"] = "true";
     } else {
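
As a rough standalone illustration of the SHALLOW-mode sampling math implemented
in SampleDedup::sample_object() above: the sketch below picks a ratio-based
number of random object indexes. The helper name pick_sample_indexes is
hypothetical, the index container is assumed to be std::set<size_t> (template
arguments are elided in the diff), and std::mt19937 seeded from
std::random_device stands in for the patch's default_random_engine.

// Minimal sketch (assumptions noted above), not part of the patch itself.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <random>
#include <set>

std::set<size_t> pick_sample_indexes(size_t count, uint32_t sampling_ratio)
{
  std::set<size_t> indexes;
  if (count == 0 || sampling_ratio == 0) {
    return indexes;
  }
  // Target sample size, e.g. 1000 objects at sampling-ratio 50 -> 500 objects.
  // Clamp so a ratio above 100 cannot make the loop below run forever.
  size_t sampling_count = std::min(count, count * sampling_ratio / 100);
  std::mt19937 generator{std::random_device{}()};
  std::uniform_int_distribution<size_t> distribution(0, count - 1);
  // Keep drawing until enough distinct indexes are collected
  // (std::set discards duplicate draws).
  while (indexes.size() < sampling_count) {
    indexes.insert(distribution(generator));
  }
  return indexes;
}

int main()
{
  auto sample = pick_sample_indexes(1000, 50);
  std::cout << "sampled " << sample.size() << " of 1000 objects" << std::endl;
  return 0;
}

With the daemon's default sampling_ratio of 50, roughly half of the listed
objects in each shard are examined per crawl; a ratio of 100 effectively visits
every object, matching the DEEP behaviour.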