Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

std.hash_map: adding a rehash() method #19923

Merged
merged 2 commits into from
Aug 8, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions lib/std/hash_map.zig
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,21 @@ pub fn HashMap(
self.unmanaged = .{};
return result;
}

/// Rehash the map, in-place.
///
/// Over time, due to the current tombstone-based implementation, a
/// HashMap could become fragmented due to the buildup of tombstone
/// entries that causes a performance degradation due to excessive
/// probing. The kind of pattern that might cause this is a long-lived
/// HashMap with repeated inserts and deletes.
///
/// After this function is called, there will be no tombstones in
/// the HashMap, each of the entries is rehashed and any existing
/// key/value pointers into the HashMap are invalidated.
pub fn rehash(self: *Self) void {
self.unmanaged.rehash(self.ctx);
}
};
}

Expand Down Expand Up @@ -1552,6 +1567,95 @@ pub fn HashMapUnmanaged(
return result;
}

/// Rehash the map, in-place.
///
/// Over time, due to the current tombstone-based implementation, a
/// HashMap could become fragmented due to the buildup of tombstone
/// entries that causes a performance degradation due to excessive
/// probing. The kind of pattern that might cause this is a long-lived
/// HashMap with repeated inserts and deletes.
///
/// After this function is called, there will be no tombstones in
/// the HashMap, each of the entries is rehashed and any existing
/// key/value pointers into the HashMap are invalidated.
pub fn rehash(self: *Self, ctx: anytype) void {
const mask = self.capacity() - 1;

var metadata = self.metadata.?;
var keys_ptr = self.keys();
var values_ptr = self.values();
var curr: Size = 0;

// While we are re-hashing every slot, we will use the
// fingerprint to mark used buckets as being used and either free
// (needing to be rehashed) or tombstone (already rehashed).

while (curr < self.capacity()) : (curr += 1) {
metadata[curr].fingerprint = Metadata.free;
}

// Now iterate over all the buckets, rehashing them

curr = 0;
while (curr < self.capacity()) {
if (!metadata[curr].isUsed()) {
assert(metadata[curr].isFree());
curr += 1;
continue;
}

const hash = ctx.hash(keys_ptr[curr]);
const fingerprint = Metadata.takeFingerprint(hash);
var idx = @as(usize, @truncate(hash & mask));

// For each bucket, rehash to an index:
// 1) before the cursor, probed into a free slot, or
// 2) equal to the cursor, no need to move, or
// 3) ahead of the cursor, probing over already rehashed

while ((idx < curr and metadata[idx].isUsed()) or
(idx > curr and metadata[idx].fingerprint == Metadata.tombstone))
{
idx = (idx + 1) & mask;
}

if (idx < curr) {
assert(metadata[idx].isFree());
metadata[idx].fill(fingerprint);
keys_ptr[idx] = keys_ptr[curr];
values_ptr[idx] = values_ptr[curr];

metadata[curr].used = 0;
assert(metadata[curr].isFree());
keys_ptr[curr] = undefined;
values_ptr[curr] = undefined;

curr += 1;
} else if (idx == curr) {
metadata[idx].fingerprint = fingerprint;
curr += 1;
} else {
assert(metadata[idx].fingerprint != Metadata.tombstone);
metadata[idx].fingerprint = Metadata.tombstone;
if (metadata[idx].isUsed()) {
std.mem.swap(K, &keys_ptr[curr], &keys_ptr[idx]);
std.mem.swap(V, &values_ptr[curr], &values_ptr[idx]);
} else {
metadata[idx].used = 1;
keys_ptr[idx] = keys_ptr[curr];
values_ptr[idx] = values_ptr[curr];

metadata[curr].fingerprint = Metadata.free;
metadata[curr].used = 0;
keys_ptr[curr] = undefined;
values_ptr[curr] = undefined;

curr += 1;
}
}
}
}

fn grow(self: *Self, allocator: Allocator, new_capacity: Size, ctx: Context) Allocator.Error!void {
@setCold(true);
const new_cap = @max(new_capacity, minimal_capacity);
Expand Down Expand Up @@ -2272,3 +2376,32 @@ test "getOrPut allocation failure" {
var map: std.StringHashMapUnmanaged(void) = .{};
try testing.expectError(error.OutOfMemory, map.getOrPut(std.testing.failing_allocator, "hello"));
}

test "std.hash_map rehash" {
var map = AutoHashMap(usize, usize).init(std.testing.allocator);
defer map.deinit();

var prng = std.Random.DefaultPrng.init(0);
const random = prng.random();

const count = 6 * random.intRangeLessThan(u32, 100_000, 500_000);

for (0..count) |i| {
try map.put(i, i);
if (i % 3 == 0) {
try expectEqual(map.remove(i), true);
}
}

map.rehash();

try expectEqual(map.count(), count * 2 / 3);

for (0..count) |i| {
if (i % 3 == 0) {
try expectEqual(map.get(i), null);
} else {
try expectEqual(map.get(i).?, i);
}
}
}