From ae80703dd44056ba1687e0f71748c8e0c7681429 Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Thu, 9 May 2024 17:25:41 -0700 Subject: [PATCH 1/2] std.hash_map: adding a rehash() method --- lib/std/hash_map.zig | 135 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/lib/std/hash_map.zig b/lib/std/hash_map.zig index e3b3e5881439..7f37a9c96d73 100644 --- a/lib/std/hash_map.zig +++ b/lib/std/hash_map.zig @@ -694,6 +694,21 @@ pub fn HashMap( self.unmanaged = .{}; return result; } + + /// Rehash the map, in-place. + /// + /// Over time, due to the current tombstone-based implementation, a + /// HashMap could become fragmented due to the buildup of tombstone + /// entries that causes a performance degradation due to excessive + /// probing. The kind of pattern that might cause this is a long-lived + /// HashMap with repeated inserts and deletes. + /// + /// After this function is called, there will be no tombstones in + /// the HashMap, each of the entries is rehashed and any existing + /// key/value pointers into the HashMap are invalidated. + pub fn rehash(self: *Self) void { + self.unmanaged.rehash(self.ctx); + } }; } @@ -1552,6 +1567,95 @@ pub fn HashMapUnmanaged( return result; } + /// Rehash the map, in-place. + /// + /// Over time, due to the current tombstone-based implementation, a + /// HashMap could become fragmented due to the buildup of tombstone + /// entries that causes a performance degradation due to excessive + /// probing. The kind of pattern that might cause this is a long-lived + /// HashMap with repeated inserts and deletes. + /// + /// After this function is called, there will be no tombstones in + /// the HashMap, each of the entries is rehashed and any existing + /// key/value pointers into the HashMap are invalidated. + pub fn rehash(self: *Self, ctx: anytype) void { + const mask = self.capacity() - 1; + + var metadata = self.metadata.?; + var keys_ptr = self.keys(); + var values_ptr = self.values(); + var curr: Size = 0; + + // While we are re-hashing every slot, we will use the + // fingerprint to mark used buckets as being used and either free + // (needing to be rehashed) or tombstone (already rehashed). + + while (curr < self.capacity()) : (curr += 1) { + metadata[curr].fingerprint = Metadata.free; + } + + // Now iterate over all the buckets, rehashing them + + curr = 0; + while (curr < self.capacity()) { + if (!metadata[curr].isUsed()) { + assert(metadata[curr].isFree()); + curr += 1; + continue; + } + + const hash = ctx.hash(keys_ptr[curr]); + const fingerprint = Metadata.takeFingerprint(hash); + var idx = @as(usize, @truncate(hash & mask)); + + // For each bucket, rehash to an index: + // 1) before the cursor, probed into a free slot, or + // 2) equal to the cursor, no need to move, or + // 3) ahead of the cursor, probing over already rehashed + + while ((idx < curr and metadata[idx].isUsed()) or + (idx > curr and metadata[idx].fingerprint == Metadata.tombstone)) + { + idx = (idx + 1) & mask; + } + + if (idx < curr) { + assert(metadata[idx].isFree()); + metadata[idx].fill(fingerprint); + keys_ptr[idx] = keys_ptr[curr]; + values_ptr[idx] = values_ptr[curr]; + + metadata[curr].used = 0; + assert(metadata[curr].isFree()); + keys_ptr[curr] = undefined; + values_ptr[curr] = undefined; + + curr += 1; + } else if (idx == curr) { + metadata[idx].fingerprint = fingerprint; + curr += 1; + } else { + assert(metadata[idx].fingerprint != Metadata.tombstone); + metadata[idx].fingerprint = Metadata.tombstone; + if (metadata[idx].isUsed()) { + std.mem.swap(K, &keys_ptr[curr], &keys_ptr[idx]); + std.mem.swap(V, &values_ptr[curr], &values_ptr[idx]); + } else { + metadata[idx].used = 1; + keys_ptr[idx] = keys_ptr[curr]; + values_ptr[idx] = values_ptr[curr]; + + metadata[curr].fingerprint = Metadata.free; + metadata[curr].used = 0; + keys_ptr[curr] = undefined; + values_ptr[curr] = undefined; + + curr += 1; + } + } + } + } + fn grow(self: *Self, allocator: Allocator, new_capacity: Size, ctx: Context) Allocator.Error!void { @setCold(true); const new_cap = @max(new_capacity, minimal_capacity); @@ -2272,3 +2376,34 @@ test "getOrPut allocation failure" { var map: std.StringHashMapUnmanaged(void) = .{}; try testing.expectError(error.OutOfMemory, map.getOrPut(std.testing.failing_allocator, "hello")); } + +test "std.hash_map rehash" { + var map = AutoHashMap(u32, u32).init(std.testing.allocator); + defer map.deinit(); + + var prng = std.rand.DefaultPrng.init(0); + const random = prng.random(); + + const count = 6 * random.intRangeLessThan(u32, 100_000, 500_000); + + var i: u32 = 0; + while (i < count) : (i += 1) { + try map.put(i, i); + if (i % 3 == 0) { + try expectEqual(map.remove(i), true); + } + } + + map.rehash(); + + try expectEqual(map.count(), count * 2 / 3); + + i = 0; + while (i < count) : (i += 1) { + if (i % 3 == 0) { + try expectEqual(map.get(i), null); + } else { + try expectEqual(map.get(i).?, i); + } + } +} From 73832df0270a9e4a2029af66b5e83ca334626e4c Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Fri, 10 May 2024 08:29:12 -0700 Subject: [PATCH 2/2] std.hash_map: cleanup test for rehash() --- lib/std/hash_map.zig | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/std/hash_map.zig b/lib/std/hash_map.zig index 7f37a9c96d73..905d44c3687d 100644 --- a/lib/std/hash_map.zig +++ b/lib/std/hash_map.zig @@ -2378,16 +2378,15 @@ test "getOrPut allocation failure" { } test "std.hash_map rehash" { - var map = AutoHashMap(u32, u32).init(std.testing.allocator); + var map = AutoHashMap(usize, usize).init(std.testing.allocator); defer map.deinit(); - var prng = std.rand.DefaultPrng.init(0); + var prng = std.Random.DefaultPrng.init(0); const random = prng.random(); const count = 6 * random.intRangeLessThan(u32, 100_000, 500_000); - var i: u32 = 0; - while (i < count) : (i += 1) { + for (0..count) |i| { try map.put(i, i); if (i % 3 == 0) { try expectEqual(map.remove(i), true); @@ -2398,8 +2397,7 @@ test "std.hash_map rehash" { try expectEqual(map.count(), count * 2 / 3); - i = 0; - while (i < count) : (i += 1) { + for (0..count) |i| { if (i % 3 == 0) { try expectEqual(map.get(i), null); } else {