Skip to content

Commit

Permalink
dedup-tool: add sampling mode to crawling
Browse files Browse the repository at this point in the history
The shallow mode aims at runtime deduplication with lower overhead. It tries
deduplication on a few sampled objects in the base pool, and only deduplicates
objects or chunks which are highly duplicated among the samples.
The deep mode crawls all objects and deduplicates them. It is suitable for
background deduplication which aims to find all duplication, even at
high overhead.
The option "sampling-ratio" is used for controlling the ratio of the objects
to be picked.
  • Loading branch information
jyha200 committed Feb 24, 2022
1 parent c344821 commit 5e862e6
Showing 1 changed file with 67 additions and 9 deletions.
76 changes: 67 additions & 9 deletions src/tools/ceph_dedup_tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ void usage()
cout << " --max-read-size <bytes> " << std::endl;
cout << " --object-dedup-threshold <percentile>" << std::endl;
cout << " --chunk-dedup-threshold <number>" << std::endl;
cout << " --sampling-ratio <percentile>" << std::endl;
cout << " --shallow-crawling" << std::endl;
cout << "explanations: " << std::endl;
cout << " chunk-dedup performs deduplication using a chunk generated by given source" << std::endl;
cout << " offset and length. object-dedup deduplicates the entire object, not a chunk" << std::endl;
Expand Down Expand Up @@ -562,6 +564,11 @@ void ChunkScrub::chunk_scrub_common()
class SampleDedup : public CrawlerThread
{
public:
enum class crawl_mode_t {
DEEP,
SHALLOW,
};

SampleDedup(
IoCtx& io_ctx,
IoCtx& chunk_io_ctx,
Expand All @@ -571,12 +578,16 @@ class SampleDedup : public CrawlerThread
ObjectCursor end,
int32_t report_period,
uint64_t num_objects,
uint32_t sampling_ratio,
uint32_t object_dedup_threshold,
uint32_t chunk_dedup_threshold,
size_t chunk_size,
std::string& fp_algo) :
std::string& fp_algo,
crawl_mode_t mode) :
CrawlerThread(io_ctx, n, m, begin, end, report_period, num_objects),
chunk_io_ctx(chunk_io_ctx),
mode(mode),
sampling_ratio(sampling_ratio),
chunk_dedup_threshold(chunk_dedup_threshold),
object_dedup_threshold(object_dedup_threshold),
chunk_size(chunk_size),
Expand Down Expand Up @@ -638,6 +649,8 @@ class SampleDedup : public CrawlerThread

Rados rados;
IoCtx chunk_io_ctx;
crawl_mode_t mode;
uint32_t sampling_ratio;
std::list<chunk_t> duplicable_chunks;
size_t total_duplicated_size = 0;
size_t total_object_size = 0;
Expand Down Expand Up @@ -707,6 +720,7 @@ class SampleDedup : public CrawlerThread
};

SampleDedup::FpStore SampleDedup::fp_store;
SampleDedup::crawl_mode_t default_crawl_mode = SampleDedup::crawl_mode_t::DEEP;
std::unordered_set<std::string> SampleDedup::flushed_objects;
std::shared_mutex SampleDedup::flushed_lock;

Expand Down Expand Up @@ -738,9 +752,9 @@ void SampleDedup::crawl() {
shard_end,
100);

// Pick few objects to be processed. Crawling mode decides how many objects
// to pick (sampling ratio). Lower sampling ratio makes crawler to has lower
// crawling overhead but finds less duplication.
// Pick few objects to be processed. Crawling mode decides how many
// objects to pick (sampling ratio). Lower sampling ratio makes crawler
// have lower crawling overhead but find less duplication.
std::set<size_t> sampled_indexes = sample_object(objects.size());
for (size_t index : sampled_indexes) {
ObjectItem target = objects[index];
Expand Down Expand Up @@ -846,8 +860,31 @@ std::tuple<std::vector<ObjectItem>, ObjectCursor> SampleDedup::get_objects(

// Pick the indexes of the objects to be processed out of `count` candidates.
// DEEP mode returns every index [0, count); SHALLOW mode returns a random
// subset whose size is sampling_ratio percent of `count`.
std::set<size_t> SampleDedup::sample_object(size_t count) {
  std::set<size_t> indexes;
  switch(mode) {
  // DEEP mode crawls all objects. Crawling has high overhead but finds all
  // duplication
  case crawl_mode_t::DEEP: {
    for (size_t index = 0 ; index < count ; index++) {
      indexes.insert(index);
    }
    break;
  }

  // SHALLOW mode crawls only a sampled subset of the objects. Crawling has
  // low overhead but may miss some duplication
  case crawl_mode_t::SHALLOW: {
    size_t sampling_count = count * sampling_ratio / 100;
    // When nothing is to be sampled (count == 0, or the ratio rounds the
    // sample size down to zero) skip RNG setup entirely; this also avoids
    // building a distribution over [0, count - 1] when count == 0
    if (sampling_count > 0) {
      // Seed from std::random_device: a default-constructed engine is
      // deterministically seeded, which would make every crawl pick
      // exactly the same objects
      std::random_device seed;
      default_random_engine generator(seed());
      uniform_int_distribution<size_t> distribution(0, count - 1);
      // std::set deduplicates draws, so loop until enough distinct
      // indexes have been collected
      while (indexes.size() < sampling_count) {
        indexes.insert(distribution(generator));
      }
    }
    break;
  }

  default:
    // Unknown crawl mode: programming error
    assert(false);
  }
  return indexes;
}
Expand Down Expand Up @@ -1755,6 +1792,19 @@ int make_crawling_daemon(const map<string, string> &opts,
}
}

SampleDedup::crawl_mode_t crawl_mode = default_crawl_mode;
i = opts.find("shallow-crawling");
if (i != opts.end()) {
crawl_mode = SampleDedup::crawl_mode_t::SHALLOW;
}
uint32_t sampling_ratio = 50;
i = opts.find("sampling-ratio");
if (i != opts.end()) {
if (rados_sistrtoll(i, &sampling_ratio)) {
return -EINVAL;
}
}

uint32_t object_dedup_threshold = 50;
i = opts.find("object-dedup-threshold");
if (i != opts.end()) {
Expand Down Expand Up @@ -1842,10 +1892,12 @@ int make_crawling_daemon(const map<string, string> &opts,
cerr << " operate fail : " << cpp_strerror(ret) << std::endl;
return ret;
}

cout << "Object Dedup Threshold : " << object_dedup_threshold << std::endl

cout << "SampleRatio : " << sampling_ratio << std::endl
<< "Object Dedup Threshold : " << object_dedup_threshold << std::endl
<< "Chunk Dedup Threshold : " << chunk_dedup_threshold << std::endl
<< "Chunk Size : " << chunk_size << std::endl
<< "Mode : " << ((crawl_mode == SampleDedup::crawl_mode_t::DEEP) ? "DEEP" : "SHALOW")
<< std::endl;

while (true) {
Expand Down Expand Up @@ -1884,10 +1936,12 @@ int make_crawling_daemon(const map<string, string> &opts,
end,
report_period,
s.num_objects,
sampling_ratio,
object_dedup_threshold,
chunk_dedup_threshold,
chunk_size,
fp_algo));
fp_algo,
crawl_mode));
ptr->set_debug(debug);
ptr->create("sample_dedup");
estimate_threads.push_back(move(ptr));
Expand Down Expand Up @@ -1968,10 +2022,14 @@ int main(int argc, const char **argv)
opts["source-length"] = val;
} else if (ceph_argparse_witharg(args, i, &val, "--dedup-cdc-chunk-size", (char*)NULL)) {
opts["dedup-cdc-chunk-size"] = val;
} else if (ceph_argparse_witharg(args, i, &val, "--sampling-ratio", (char*)NULL)){
opts["sampling-ratio"] = val;
} else if (ceph_argparse_witharg(args, i, &val, "--object-dedup-threshold", (char*)NULL)) {
opts["object-dedup-threshold"] = val;
} else if (ceph_argparse_witharg(args, i, &val, "--chunk-dedup-threshold", (char*)NULL)) {
opts["chunk-dedup-threshold"] = val;
} else if (ceph_argparse_flag(args, i, "--shallow-crawling", (char*)NULL)) {
opts["shallow-crawling"] = true;
} else if (ceph_argparse_flag(args, i, "--debug", (char*)NULL)) {
opts["debug"] = "true";
} else {
Expand Down

0 comments on commit 5e862e6

Please sign in to comment.