From d9d7358a233ebe48e896a26e05fa3cffe30f36d6 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Thu, 18 Apr 2024 22:03:30 +0300
Subject: [PATCH 01/12] shard: give test shard an ID

Prevent a panic on test error:

    --- FAIL: TestDumpIgnoreErrors (2.09s)
        dump_test.go:399:
            Error Trace: /home/rik/neofs-node/pkg/local_object_storage/shard/dump_test.go:399
            Error:       Not equal:
                         expected: 10
                         actual  : 11
            Test:        TestDumpIgnoreErrors
    panic: runtime error: invalid memory address or nil pointer dereference
    [signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0xa365a0]

    goroutine 1156 [running]:
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/shard.New.func1({0xbf05fb, 0x1e}, {0xd7ad80, 0xc000458bd0})
        /home/rik/neofs-node/pkg/local_object_storage/shard/shard.go:138 +0x40
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).reportFlushError(0xe6812a13a28266c3?, {0xbf05fb?, 0xb7e97c32721668a?}, {0xc0000a05a0?, 0x80008050838d2b0d?}, {0xd7ad80?, 0xc000458bd0?})
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:167 +0x25d
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).flushObject(0xc0004d62c0, 0xc0003a8480, {0x0, 0x0, 0x0})
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:272 +0x348
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).flushWorker(0xc0004d62c0, 0x0?)
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:241 +0x12d
    created by github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).runFlushLoop
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:35 +0x30

    panic: runtime error: invalid memory address or nil pointer dereference
    [signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0xa365a0]

    goroutine 1035 [running]:
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/shard.New.func1({0xbf05fb, 0x1e}, {0xd7ad80, 0xc0004d1500})
        /home/rik/neofs-node/pkg/local_object_storage/shard/shard.go:138 +0x40
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).reportFlushError(0xe6812a13a28266c3?, {0xbf05fb?, 0xb7e97c32721668a?}, {0xc00044c660?, 0x69f6f53b67fffa56?}, {0xd7ad80?, 0xc0004d1500?})
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:167 +0x25d
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).flushObject(0xc0004d62c0, 0xc0003a8420, {0x0, 0x0, 0x0})
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:272 +0x348
    github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).flushWorker(0xc0004d62c0, 0xc0004d62c0?)
        /home/rik/neofs-node/pkg/local_object_storage/writecache/flush.go:241 +0x12d
    created by github.com/nspcc-dev/neofs-node/pkg/local_object_storage/writecache.(*cache).runFlushLoop

Signed-off-by: Roman Khimov
---
 pkg/local_object_storage/shard/shard_test.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/local_object_storage/shard/shard_test.go b/pkg/local_object_storage/shard/shard_test.go
index d95b76f6ca..e2be2f8bf8 100644
--- a/pkg/local_object_storage/shard/shard_test.go
+++ b/pkg/local_object_storage/shard/shard_test.go
@@ -67,6 +67,7 @@ func newCustomShard(t testing.TB, rootPath string, enableWriteCache bool, wcOpts
 	}

 	opts := append([]shard.Option{
+		shard.WithID(shard.NewIDFromBytes([]byte("testShard"))),
 		shard.WithLogger(zap.L()),
 		shard.WithBlobStorOptions(bsOpts...),
 		shard.WithMetaBaseOptions(

From faa5b67b6658f81c48def71e43caa2580d87cb21 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Sat, 9 Sep 2023 23:20:09 +0300
Subject: [PATCH 02/12] fstree: combined writes for small objects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Writing small objects is a problem in that per-object overheads become
comparable to the size of the data being written. This is especially true
for HDDs, which strongly prefer sequential writes (though it is also better
for SSD block management). So for small objects we can try pushing several
of them into a single file while keeping a hard link for each of them, and
this is what is implemented here.

The intention was to provide better results for the 128K-1M range of object
sizes, since we already have Peapod for smaller ones and larger ones were
never a problem. But in fact the test results suggest that this approach
beats the original FSTree only in the <128K range, because at 256K it is
already slower. At the same time it outperforms Peapod in just about every
case except 1-byte payloads, which are purely synthetic (real objects carry
at least ~150 bytes of metadata overhead). So the suggestion now is to
eventually replace Peapod with this mechanism and simplify configuration
(one proper storage subsystem to configure).

Caveats:
 * even though batching could be done in a more generic way, we rely
   heavily on hard links, so this is at least Unix-only; since Linux is
   currently the primary target, it is implemented there only
 * the decoder part lives in the generic code, since one can copy (back
   up) shards to other systems and expect them to work
 * deletion is more involved now, since the underlying file won't go away
   until all hard links to it are deleted; this is expected to be OK for
   our use case
 * defaultTick is the same as for BoltDB for easy comparison
 * combinedSizeThresh was set to 8M for tests, but is now lowered based on
   the results
 * combinedSizeLimit/combinedCountLimit are a bit more empirical, but seem
   to be fine based on the results

Also tried:
 * O_DSYNC instead of fdatasync(): works fine for a single thread, but
   anything multithreaded slows down 10-20-fold, so it is not worth even
   an option
 * 32M batches packing more of the 64K-1M objects: these just suck, with
   2-10x performance degradation
 * 2M batches packing fewer of the 64K-1M objects: +30-80% mostly, with
   only one specific case improved by 16% (64K/20 threads)
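To make the on-disk format concrete: each record in a combined file is a
one-byte 0x7f prefix, the 32-byte object ID, a big-endian 32-bit payload
length, and then the payload, with records simply concatenated. A minimal
standalone sketch of that layout follows; encodeRecord is a hypothetical
helper for illustration only (the real writer assembles the same prefix in
syncBatch.write() below):

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// combinedPrefix marks a combined file; see the patch below for why 0x7f
// can never start a valid Protobuf-encoded object.
const combinedPrefix = 0x7f

// encodeRecord is a hypothetical illustration of one combined-file record:
// prefix byte | 32-byte object ID | 4-byte big-endian length | payload.
func encodeRecord(id [32]byte, payload []byte) []byte {
	buf := make([]byte, 0, 1+32+4+len(payload))
	buf = append(buf, combinedPrefix)
	buf = append(buf, id[:]...)
	buf = binary.BigEndian.AppendUint32(buf, uint32(len(payload)))
	return append(buf, payload...)
}

func main() {
	var id [32]byte
	copy(id[:], "some-object-id")
	rec := encodeRecord(id, []byte("payload"))
	fmt.Println(bytes.HasPrefix(rec, []byte{combinedPrefix})) // true
}
```

A reader scans records until it matches the requested ID, which is exactly
what extractCombinedObject() in the diff below does.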
HDD results, original FSTree vs. combined:

goos: linux
goarch: amd64
pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor
cpu: AMD Ryzen 5 1600 Six-Core Processor
                               │ hdd.fstree-original │ hdd.fstree-combined │
                               │       sec/op        │  sec/op    vs base  │
Put/size=1,thread=1-12           32.16m ± 17%   42.07m ± 11%  +30.83% (p=0.000 n=10)
Put/size=1,thread=20-12          181.59m ± 21%  42.36m ± 6%   -76.67% (p=0.000 n=10)
Put/size=1,thread=100-12         261.72m ± 11%  52.95m ± 14%  -79.77% (p=0.000 n=10)
Put/size=1,thread=200-12         308.1m ± 13%   116.6m ± 9%   -62.17% (p=0.000 n=10)
Put/size=256,thread=1-12         39.63m ± 20%   45.50m ± 12%  +14.83% (p=0.003 n=10)
Put/size=256,thread=20-12        188.48m ± 17%  41.56m ± 7%   -77.95% (p=0.000 n=10)
Put/size=256,thread=100-12       254.06m ± 18%  51.91m ± 8%   -79.57% (p=0.000 n=10)
Put/size=256,thread=200-12       333.3m ± 18%   113.8m ± 8%   -65.85% (p=0.000 n=10)
Put/size=1024,thread=1-12        39.14m ± 20%   45.51m ± 9%   +16.28% (p=0.000 n=10)
Put/size=1024,thread=20-12       212.18m ± 14%  50.37m ± 15%  -76.26% (p=0.000 n=10)
Put/size=1024,thread=100-12      275.32m ± 9%   61.55m ± 13%  -77.64% (p=0.000 n=10)
Put/size=1024,thread=200-12      341.0m ± 18%   112.9m ± 18%  -66.88% (p=0.000 n=10)
Put/size=4096,thread=1-12        34.59m ± 15%   42.46m ± 17%  +22.75% (p=0.000 n=10)
Put/size=4096,thread=20-12       226.93m ± 14%  56.33m ± 10%  -75.18% (p=0.000 n=10)
Put/size=4096,thread=100-12      273.45m ± 8%   59.62m ± 9%   -78.20% (p=0.000 n=10)
Put/size=4096,thread=200-12      354.0m ± 16%   127.8m ± 9%   -63.89% (p=0.000 n=10)
Put/size=16384,thread=1-12       39.87m ± 3%    45.78m ± 19%  +14.82% (p=0.000 n=10)
Put/size=16384,thread=20-12      195.82m ± 30%  57.03m ± 8%   -70.88% (p=0.000 n=10)
Put/size=16384,thread=100-12     279.80m ± 15%  73.18m ± 5%   -73.85% (p=0.000 n=10)
Put/size=16384,thread=200-12     335.7m ± 20%   144.7m ± 11%  -56.90% (p=0.000 n=10)
Put/size=65536,thread=1-12       38.47m ± 8%    49.87m ± 7%   +29.63% (p=0.000 n=10)
Put/size=65536,thread=20-12      211.15m ± 12%  62.50m ± 8%   -70.40% (p=0.000 n=10)
Put/size=65536,thread=100-12     299.8m ± 14%   158.6m ± 7%   -47.10% (p=0.000 n=10)
Put/size=65536,thread=200-12     355.1m ± 117%  299.2m ± 8%   -15.76% (p=0.000 n=10)
Put/size=262144,thread=1-12      43.28m ± 20%   50.32m ± 11%  +16.28% (p=0.001 n=10)
Put/size=262144,thread=20-12     130.2m ± 11%   138.6m ± 9%   ~ (p=0.123 n=10)
Put/size=262144,thread=100-12    265.6m ± 18%   464.3m ± 11%  +74.81% (p=0.000 n=10)
Put/size=262144,thread=200-12    491.9m ± 10%   816.9m ± 12%  +66.06% (p=0.000 n=10)
Put/size=1048576,thread=1-12     42.59m ± 14%   50.79m ± 20%  +19.25% (p=0.000 n=10)
Put/size=1048576,thread=20-12    203.9m ± 9%    345.6m ± 14%  +69.52% (p=0.000 n=10)
Put/size=1048576,thread=100-12   855.1m ± 5%    1447.1m ± 4%  +69.24% (p=0.000 n=10)
Put/size=1048576,thread=200-12   1.650 ± 10%    2.726 ± 4%    +65.23% (p=0.000 n=10)
Put/size=4194304,thread=1-12     67.38m ± 12%   74.44m ± 9%   +10.48% (p=0.035 n=10)
Put/size=4194304,thread=20-12    745.9m ± 2%    1060.9m ± 2%  +42.22% (p=0.000 n=10)
Put/size=4194304,thread=100-12   3.794 ± 3%     4.952 ± 6%    +30.52% (p=0.000 n=10)
Put/size=4194304,thread=200-12   8.835 ± 30%    10.622 ± 9%   ~ (p=0.123 n=10)
geomean                          218.6m         138.1m        -36.84%

                               │ hdd.fstree-original │ hdd.fstree-combined │
                               │        B/op         │   B/op     vs base  │
Put/size=1,thread=1-12           2.430Ki ± 1%   2.812Ki ± 4%  +15.71% (p=0.000 n=10)
Put/size=1,thread=20-12          47.03Ki ± 7%   44.72Ki ± 1%  -4.91% (p=0.000 n=10)
Put/size=1,thread=100-12         224.7Ki ± 2%   206.1Ki ± 1%  -8.28% (p=0.000 n=10)
Put/size=1,thread=200-12         437.7Ki ± 2%   411.3Ki ± 1%  -6.01% (p=0.000 n=10)
Put/size=256,thread=1-12         2.433Ki ± 1%   2.813Ki ± 1%  +15.63% (p=0.000 n=10)
Put/size=256,thread=20-12        46.81Ki ± 1%   44.58Ki ± 1%  -4.77% (p=0.000 n=10)
Put/size=256,thread=100-12       226.5Ki ± 2%   207.1Ki ± 1%  -8.55% (p=0.000 n=10)
Put/size=256,thread=200-12       440.9Ki ± 2%   412.0Ki ± 2%  -6.55% (p=0.000 n=10)
Put/size=1024,thread=1-12        2.452Ki ± 1%   2.806Ki ± 2%  +14.44% (p=0.000 n=10)
Put/size=1024,thread=20-12       47.28Ki ± 1%   44.78Ki ± 1%  -5.30% (p=0.000 n=10)
Put/size=1024,thread=100-12      227.7Ki ± 1%   209.6Ki ± 0%  -7.94% (p=0.000 n=10)
Put/size=1024,thread=200-12      444.5Ki ± 1%   419.6Ki ± 1%  -5.62% (p=0.000 n=10)
Put/size=4096,thread=1-12        2.448Ki ± 1%   2.835Ki ± 0%  +15.82% (p=0.000 n=10)
Put/size=4096,thread=20-12       47.47Ki ± 1%   44.88Ki ± 0%  -5.46% (p=0.000 n=10)
Put/size=4096,thread=100-12      227.2Ki ± 3%   210.4Ki ± 0%  -7.37% (p=0.000 n=10)
Put/size=4096,thread=200-12      442.9Ki ± 4%   422.5Ki ± 1%  -4.61% (p=0.000 n=10)
Put/size=16384,thread=1-12       2.458Ki ± 1%   2.830Ki ± 2%  +15.09% (p=0.000 n=10)
Put/size=16384,thread=20-12      47.57Ki ± 1%   45.42Ki ± 1%  -4.52% (p=0.000 n=10)
Put/size=16384,thread=100-12     228.2Ki ± 2%   211.3Ki ± 2%  -7.41% (p=0.000 n=10)
Put/size=16384,thread=200-12     442.2Ki ± 4%   425.2Ki ± 1%  -3.86% (p=0.000 n=10)
Put/size=65536,thread=1-12       2.454Ki ± 2%   2.855Ki ± 3%  +16.35% (p=0.000 n=10)
Put/size=65536,thread=20-12      47.61Ki ± 1%   45.57Ki ± 1%  -4.29% (p=0.000 n=10)
Put/size=65536,thread=100-12     231.2Ki ± 2%   221.8Ki ± 1%  -4.06% (p=0.000 n=10)
Put/size=65536,thread=200-12     449.6Ki ± 2%   443.3Ki ± 1%  -1.41% (p=0.000 n=10)
Put/size=262144,thread=1-12      2.491Ki ± 2%   2.869Ki ± 2%  +15.19% (p=0.000 n=10)
Put/size=262144,thread=20-12     46.69Ki ± 1%   47.29Ki ± 1%  +1.30% (p=0.000 n=10)
Put/size=262144,thread=100-12    228.5Ki ± 1%   232.5Ki ± 1%  +1.78% (p=0.002 n=10)
Put/size=262144,thread=200-12    454.2Ki ± 2%   459.9Ki ± 1%  ~ (p=0.052 n=10)
Put/size=1048576,thread=1-12     2.506Ki ± 4%   2.911Ki ± 2%  +16.17% (p=0.000 n=10)
Put/size=1048576,thread=20-12    49.20Ki ± 5%   53.53Ki ± 6%  +8.79% (p=0.000 n=10)
Put/size=1048576,thread=100-12   245.1Ki ± 3%   249.3Ki ± 12% +1.72% (p=0.043 n=10)
Put/size=1048576,thread=200-12   528.8Ki ± 5%   484.2Ki ± 1%  -8.43% (p=0.002 n=10)
Put/size=4194304,thread=1-12     2.582Ki ± 1%   2.913Ki ± 1%  +12.82% (p=0.000 n=10)
Put/size=4194304,thread=20-12    51.03Ki ± 1%   57.15Ki ± 1%  +12.00% (p=0.000 n=10)
Put/size=4194304,thread=100-12   248.3Ki ± 4%   259.1Ki ± 4%  +4.35% (p=0.005 n=10)
Put/size=4194304,thread=200-12   505.3Ki ± 2%   509.2Ki ± 1%  ~ (p=0.165 n=10)
geomean                          59.58Ki        60.34Ki       +1.27%

                               │ hdd.fstree-original │ hdd.fstree-combined  │
                               │      allocs/op      │ allocs/op   vs base  │
Put/size=1,thread=1-12           27.00 ± 4%     33.50 ± 1%    +24.07% (p=0.000 n=10)
Put/size=1,thread=20-12          518.5 ± 1%     496.5 ± 1%    -4.24% (p=0.000 n=10)
Put/size=1,thread=100-12         2.537k ± 1%    2.284k ± 1%   -9.97% (p=0.000 n=10)
Put/size=1,thread=200-12         4.979k ± 1%    4.564k ± 1%   -8.35% (p=0.000 n=10)
Put/size=256,thread=1-12         27.00 ± 0%     33.50 ± 1%    +24.07% (p=0.000 n=10)
Put/size=256,thread=20-12        514.5 ± 1%     496.0 ± 1%    -3.60% (p=0.000 n=10)
Put/size=256,thread=100-12       2.547k ± 1%    2.289k ± 0%   -10.13% (p=0.000 n=10)
Put/size=256,thread=200-12       4.985k ± 2%    4.555k ± 2%   -8.63% (p=0.000 n=10)
Put/size=1024,thread=1-12        27.00 ± 4%     33.00 ± 3%    +22.22% (p=0.000 n=10)
Put/size=1024,thread=20-12       517.0 ± 1%     496.0 ± 1%    -4.06% (p=0.000 n=10)
Put/size=1024,thread=100-12      2.545k ± 0%    2.303k ± 1%   -9.51% (p=0.000 n=10)
Put/size=1024,thread=200-12      5.006k ± 1%    4.606k ± 1%   -8.00% (p=0.000 n=10)
Put/size=4096,thread=1-12        27.00 ± 0%     34.00 ± 0%    +25.93% (p=0.000 n=10)
Put/size=4096,thread=20-12       519.0 ± 1%     496.5 ± 1%    -4.34% (p=0.000 n=10)
Put/size=4096,thread=100-12      2.538k ± 3%    2.307k ± 0%   -9.12% (p=0.000 n=10)
Put/size=4096,thread=200-12      4.971k ± 4%    4.631k ± 1%   -6.84% (p=0.000 n=10)
Put/size=16384,thread=1-12       27.00 ± 4%     34.00 ± 3%    +25.93% (p=0.000 n=10)
Put/size=16384,thread=20-12      517.5 ± 1%     499.0 ± 1%    -3.57% (p=0.000 n=10)
Put/size=16384,thread=100-12     2.540k ± 2%    2.316k ± 2%   -8.84% (p=0.000 n=10)
Put/size=16384,thread=200-12     4.963k ± 3%    4.656k ± 1%   -6.19% (p=0.000 n=10)
Put/size=65536,thread=1-12       27.00 ± 4%     34.00 ± 3%    +25.93% (p=0.000 n=10)
Put/size=65536,thread=20-12      518.0 ± 1%     501.0 ± 1%    -3.28% (p=0.000 n=10)
Put/size=65536,thread=100-12     2.571k ± 2%    2.429k ± 1%   -5.54% (p=0.000 n=10)
Put/size=65536,thread=200-12     5.042k ± 2%    4.857k ± 0%   -3.67% (p=0.000 n=10)
Put/size=262144,thread=1-12      27.00 ± 4%     34.00 ± 3%    +25.93% (p=0.000 n=10)
Put/size=262144,thread=20-12     507.0 ± 1%     520.5 ± 1%    +2.66% (p=0.000 n=10)
Put/size=262144,thread=100-12    2.542k ± 1%    2.551k ± 1%   ~ (p=0.171 n=10)
Put/size=262144,thread=200-12    5.090k ± 2%    5.048k ± 1%   ~ (p=0.105 n=10)
Put/size=1048576,thread=1-12     28.00 ± 4%     34.00 ± 0%    +21.43% (p=0.000 n=10)
Put/size=1048576,thread=20-12    522.5 ± 1%     570.0 ± 2%    +9.09% (p=0.000 n=10)
Put/size=1048576,thread=100-12   2.701k ± 2%    2.754k ± 2%   +1.96% (p=0.001 n=10)
Put/size=1048576,thread=200-12   5.814k ± 4%    5.340k ± 1%   -8.16% (p=0.001 n=10)
Put/size=4194304,thread=1-12     28.50 ± 2%     34.00 ± 0%    +19.30% (p=0.000 n=10)
Put/size=4194304,thread=20-12    554.0 ± 1%     644.5 ± 1%    +16.34% (p=0.000 n=10)
Put/size=4194304,thread=100-12   2.755k ± 4%    2.938k ± 2%   +6.62% (p=0.000 n=10)
Put/size=4194304,thread=200-12   5.628k ± 2%    5.781k ± 0%   +2.73% (p=0.002 n=10)
geomean                          659.6          677.9         +2.77%

HDD results, Peapod vs. combined FSTree:

goos: linux
goarch: amd64
pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor
cpu: AMD Ryzen 5 1600 Six-Core Processor
                               │ hdd.peapod │ hdd.fstree-combined │
                               │   sec/op   │  sec/op    vs base  │
Put/size=1,thread=1-12           37.24m ± 13%   42.07m ± 11%   +12.98% (p=0.015 n=10)
Put/size=1,thread=20-12          81.66m ± 10%   42.36m ± 6%    -48.12% (p=0.000 n=10)
Put/size=1,thread=100-12         85.56m ± 16%   52.95m ± 14%   -38.12% (p=0.000 n=10)
Put/size=1,thread=200-12         85.10m ± 10%   116.56m ± 9%   +36.96% (p=0.000 n=10)
Put/size=256,thread=1-12         51.45m ± 9%    45.50m ± 12%   -11.55% (p=0.001 n=10)
Put/size=256,thread=20-12        87.50m ± 12%   41.56m ± 7%    -52.51% (p=0.000 n=10)
Put/size=256,thread=100-12       100.30m ± 12%  51.91m ± 8%    -48.24% (p=0.000 n=10)
Put/size=256,thread=200-12       108.0m ± 11%   113.8m ± 8%    ~ (p=0.315 n=10)
Put/size=1024,thread=1-12        63.41m ± 11%   45.51m ± 9%    -28.23% (p=0.000 n=10)
Put/size=1024,thread=20-12       88.01m ± 9%    50.37m ± 15%   -42.77% (p=0.000 n=10)
Put/size=1024,thread=100-12      107.36m ± 9%   61.55m ± 13%   -42.67% (p=0.000 n=10)
Put/size=1024,thread=200-12      120.9m ± 11%   112.9m ± 18%   ~ (p=0.165 n=10)
Put/size=4096,thread=1-12        70.93m ± 12%   42.46m ± 17%   -40.14% (p=0.000 n=10)
Put/size=4096,thread=20-12       100.68m ± 15%  56.33m ± 10%   -44.06% (p=0.000 n=10)
Put/size=4096,thread=100-12      129.19m ± 7%   59.62m ± 9%    -53.85% (p=0.000 n=10)
Put/size=4096,thread=200-12      158.4m ± 2%    127.8m ± 9%    -19.28% (p=0.000 n=10)
Put/size=16384,thread=1-12       74.06m ± 18%   45.78m ± 19%   -38.19% (p=0.000 n=10)
Put/size=16384,thread=20-12      117.52m ± 9%   57.03m ± 8%    -51.47% (p=0.000 n=10)
Put/size=16384,thread=100-12     188.78m ± 9%   73.18m ± 5%    -61.24% (p=0.000 n=10)
Put/size=16384,thread=200-12     242.2m ± 13%   144.7m ± 11%   -40.27% (p=0.000 n=10)
Put/size=65536,thread=1-12       83.27m ± 5%    49.87m ± 7%    -40.10% (p=0.000 n=10)
Put/size=65536,thread=20-12      164.54m ± 12%  62.50m ± 8%    -62.01% (p=0.000 n=10)
Put/size=65536,thread=100-12     342.2m ± 11%   158.6m ± 7%    -53.66% (p=0.000 n=10)
Put/size=65536,thread=200-12     392.7m ± 14%   299.2m ± 8%    -23.81% (p=0.000 n=10)
Put/size=262144,thread=1-12      79.08m ± 19%   50.32m ± 11%   -36.36% (p=0.000 n=10)
Put/size=262144,thread=20-12     226.6m ± 11%   138.6m ± 9%    -38.83% (p=0.000 n=10)
Put/size=262144,thread=100-12    492.3m ± 5%    464.3m ± 11%   -5.68% (p=0.015 n=10)
Put/size=262144,thread=200-12    707.5m ± 6%    816.9m ± 12%   +15.45% (p=0.000 n=10)
Put/size=1048576,thread=1-12                    50.79m ± 20%
Put/size=1048576,thread=20-12                   345.6m ± 14%
Put/size=1048576,thread=100-12                  1.447 ± 4%
Put/size=1048576,thread=200-12                  2.726 ± 4%
Put/size=4194304,thread=1-12                    74.44m ± 9%
Put/size=4194304,thread=20-12                   1.061 ± 2%
Put/size=4194304,thread=100-12                  4.952 ± 6%
Put/size=4194304,thread=200-12                  10.62 ± 9%
geomean                          125.0m         138.1m         -34.43%

                               │ hdd.peapod │ hdd.fstree-combined │
                               │    B/op    │   B/op     vs base  │
Put/size=1,thread=1-12           11.890Ki ± 1%   2.812Ki ± 4%  -76.35% (p=0.000 n=10)
Put/size=1,thread=20-12          51.05Ki ± 22%   44.72Ki ± 1%  ~ (p=0.143 n=10)
Put/size=1,thread=100-12         213.6Ki ± 3%    206.1Ki ± 1%  -3.52% (p=0.004 n=10)
Put/size=1,thread=200-12         440.7Ki ± 13%   411.3Ki ± 1%  ~ (p=0.143 n=10)
Put/size=256,thread=1-12         9.207Ki ± 1%    2.813Ki ± 1%  -69.44% (p=0.000 n=10)
Put/size=256,thread=20-12        43.71Ki ± 13%   44.58Ki ± 1%  ~ (p=0.481 n=10)
Put/size=256,thread=100-12       239.7Ki ± 8%    207.1Ki ± 1%  -13.62% (p=0.000 n=10)
Put/size=256,thread=200-12       450.8Ki ± 4%    412.0Ki ± 2%  -8.60% (p=0.002 n=10)
Put/size=1024,thread=1-12        12.169Ki ± 2%   2.806Ki ± 2%  -76.94% (p=0.000 n=10)
Put/size=1024,thread=20-12       114.50Ki ± 4%   44.78Ki ± 1%  -60.89% (p=0.000 n=10)
Put/size=1024,thread=100-12      570.2Ki ± 3%    209.6Ki ± 0%  -63.25% (p=0.000 n=10)
Put/size=1024,thread=200-12      1088.3Ki ± 6%   419.6Ki ± 1%  -61.45% (p=0.000 n=10)
Put/size=4096,thread=1-12        32.011Ki ± 4%   2.835Ki ± 0%  -91.14% (p=0.000 n=10)
Put/size=4096,thread=20-12       463.28Ki ± 6%   44.88Ki ± 0%  -90.31% (p=0.000 n=10)
Put/size=4096,thread=100-12      2155.6Ki ± 3%   210.4Ki ± 0%  -90.24% (p=0.000 n=10)
Put/size=4096,thread=200-12      4317.9Ki ± 2%   422.5Ki ± 1%  -90.21% (p=0.000 n=10)
Put/size=16384,thread=1-12       90.904Ki ± 2%   2.830Ki ± 2%  -96.89% (p=0.000 n=10)
Put/size=16384,thread=20-12      1528.41Ki ± 5%  45.42Ki ± 1%  -97.03% (p=0.000 n=10)
Put/size=16384,thread=100-12     7272.0Ki ± 2%   211.3Ki ± 2%  -97.09% (p=0.000 n=10)
Put/size=16384,thread=200-12     14619.0Ki ± 4%  425.2Ki ± 1%  -97.09% (p=0.000 n=10)
Put/size=65536,thread=1-12       327.574Ki ± 2%  2.855Ki ± 3%  -99.13% (p=0.000 n=10)
Put/size=65536,thread=20-12      5368.91Ki ± 4%  45.57Ki ± 1%  -99.15% (p=0.000 n=10)
Put/size=65536,thread=100-12     27878.2Ki ± 3%  221.8Ki ± 1%  -99.20% (p=0.000 n=10)
Put/size=65536,thread=200-12     61247.9Ki ± 4%  443.3Ki ± 1%  -99.28% (p=0.000 n=10)
Put/size=262144,thread=1-12      1244.865Ki ± 1%  2.869Ki ± 2%  -99.77% (p=0.000 n=10)
Put/size=262144,thread=20-12     20783.90Ki ± 8%  47.29Ki ± 1%  -99.77% (p=0.000 n=10)
Put/size=262144,thread=100-12    125171.4Ki ± 1%  232.5Ki ± 1%  -99.81% (p=0.000 n=10)
Put/size=262144,thread=200-12    319065.4Ki ± 1%  459.9Ki ± 1%  -99.86% (p=0.000 n=10)
Put/size=1048576,thread=1-12                     2.911Ki ± 2%
Put/size=1048576,thread=20-12                    53.53Ki ± 6%
Put/size=1048576,thread=100-12                   249.3Ki ± 12%
Put/size=1048576,thread=200-12                   484.2Ki ± 1%
Put/size=4194304,thread=1-12                     2.913Ki ± 1%
Put/size=4194304,thread=20-12                    57.15Ki ± 1%
Put/size=4194304,thread=100-12                   259.1Ki ± 4%
Put/size=4194304,thread=200-12                   509.2Ki ± 1%
geomean                          857.7Ki         60.34Ki       -93.18%

                               │ hdd.peapod │ hdd.fstree-combined  │
                               │ allocs/op  │ allocs/op   vs base  │
Put/size=1,thread=1-12           51.00 ± 2%    33.50 ± 1%    -34.31% (p=0.000 n=10)
Put/size=1,thread=20-12          188.0 ± 4%    496.5 ± 1%    +164.10% (p=0.000 n=10)
Put/size=1,thread=100-12         942.5 ± 17%   2284.0 ± 1%   +142.33% (p=0.000 n=10)
Put/size=1,thread=200-12         2.150k ± 10%  4.564k ± 1%   +112.26% (p=0.000 n=10)
Put/size=256,thread=1-12         57.00 ± 0%    33.50 ± 1%    -41.23% (p=0.000 n=10)
Put/size=256,thread=20-12        233.5 ± 12%   496.0 ± 1%    +112.42% (p=0.000 n=10)
Put/size=256,thread=100-12       1.278k ± 8%   2.289k ± 0%   +79.11% (p=0.000 n=10)
Put/size=256,thread=200-12       2.565k ± 6%   4.555k ± 2%   +77.58% (p=0.000 n=10)
Put/size=1024,thread=1-12        61.00 ± 0%    33.00 ± 3%    -45.90% (p=0.000 n=10)
Put/size=1024,thread=20-12       331.0 ± 4%    496.0 ± 1%    +49.85% (p=0.000 n=10)
Put/size=1024,thread=100-12      1.823k ± 4%   2.303k ± 1%   +26.30% (p=0.000 n=10)
Put/size=1024,thread=200-12      3.783k ± 2%   4.606k ± 1%   +21.77% (p=0.000 n=10)
Put/size=4096,thread=1-12        63.00 ± 2%    34.00 ± 0%    -46.03% (p=0.000 n=10)
Put/size=4096,thread=20-12       369.0 ± 5%    496.5 ± 1%    +34.55% (p=0.000 n=10)
Put/size=4096,thread=100-12      1.819k ± 6%   2.307k ± 0%   +26.83% (p=0.000 n=10)
Put/size=4096,thread=200-12      3.825k ± 4%   4.631k ± 1%   +21.07% (p=0.000 n=10)
Put/size=16384,thread=1-12       66.00 ± 2%    34.00 ± 3%    -48.48% (p=0.000 n=10)
Put/size=16384,thread=20-12      419.0 ± 4%    499.0 ± 1%    +19.09% (p=0.000 n=10)
Put/size=16384,thread=100-12     1.954k ± 1%   2.316k ± 2%   +18.47% (p=0.000 n=10)
Put/size=16384,thread=200-12     3.827k ± 3%   4.656k ± 1%   +21.66% (p=0.000 n=10)
Put/size=65536,thread=1-12       73.00 ± 1%    34.00 ± 3%    -53.42% (p=0.000 n=10)
Put/size=65536,thread=20-12      452.0 ± 4%    501.0 ± 1%    +10.84% (p=0.000 n=10)
Put/size=65536,thread=100-12     2.035k ± 2%   2.429k ± 1%   +19.36% (p=0.000 n=10)
Put/size=65536,thread=200-12     3.904k ± 1%   4.857k ± 0%   +24.43% (p=0.000 n=10)
Put/size=262144,thread=1-12      84.00 ± 1%    34.00 ± 3%    -59.52% (p=0.000 n=10)
Put/size=262144,thread=20-12     459.0 ± 4%    520.5 ± 1%    +13.40% (p=0.000 n=10)
Put/size=262144,thread=100-12    1.964k ± 2%   2.551k ± 1%   +29.91% (p=0.000 n=10)
Put/size=262144,thread=200-12    4.268k ± 2%   5.048k ± 1%   +18.29% (p=0.000 n=10)
Put/size=1048576,thread=1-12                   34.00 ± 0%
Put/size=1048576,thread=20-12                  570.0 ± 2%
Put/size=1048576,thread=100-12                 2.754k ± 2%
Put/size=1048576,thread=200-12                 5.340k ± 1%
Put/size=4194304,thread=1-12                   34.00 ± 0%
Put/size=4194304,thread=20-12                  644.5 ± 1%
Put/size=4194304,thread=100-12                 2.938k ± 2%
Put/size=4194304,thread=200-12                 5.781k ± 0%
geomean                          587.1         677.9         +11.94%
SSD results, Peapod vs. combined FSTree:

goos: linux
goarch: amd64
pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor
cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics
                               │ ssd.peapod │ ssd.fstree-combined │
                               │   sec/op   │  sec/op    vs base  │
Put/size=1,thread=1-16           10.65m ± 1%    14.04m ± 1%   +31.77% (p=0.000 n=10)
Put/size=1,thread=20-16          10.59m ± 0%    15.09m ± 2%   +42.49% (p=0.000 n=10)
Put/size=1,thread=100-16         10.46m ± 1%    17.31m ± 3%   +65.42% (p=0.000 n=10)
Put/size=1,thread=200-16         11.62m ± 85%   33.77m ± 2%   +190.61% (p=0.000 n=10)
Put/size=256,thread=1-16         10.95m ± 20%   14.02m ± 1%   +27.97% (p=0.000 n=10)
Put/size=256,thread=20-16        11.72m ± 6%    15.58m ± 2%   +32.99% (p=0.000 n=10)
Put/size=256,thread=100-16       22.24m ± 12%   16.87m ± 6%   -24.15% (p=0.000 n=10)
Put/size=256,thread=200-16       26.05m ± 2%    29.55m ± 2%   +13.44% (p=0.000 n=10)
Put/size=1024,thread=1-16        10.72m ± 10%   12.30m ± 0%   +14.76% (p=0.000 n=10)
Put/size=1024,thread=20-16       19.92m ± 47%   13.86m ± 2%   ~ (p=0.481 n=10)
Put/size=1024,thread=100-16      26.05m ± 1%    15.69m ± 2%   -39.75% (p=0.000 n=10)
Put/size=1024,thread=200-16      28.87m ± 2%    30.10m ± 1%   +4.28% (p=0.000 n=10)
Put/size=4096,thread=1-16        12.49m ± 20%   12.86m ± 1%   ~ (p=0.481 n=10)
Put/size=4096,thread=20-16       21.47m ± 4%    14.14m ± 2%   -34.15% (p=0.000 n=10)
Put/size=4096,thread=100-16      19.41m ± 42%   16.10m ± 2%   -17.07% (p=0.000 n=10)
Put/size=4096,thread=200-16      33.15m ± 1%    31.72m ± 2%   -4.30% (p=0.002 n=10)
Put/size=16384,thread=1-16       12.07m ± 17%   12.89m ± 1%   ~ (p=0.143 n=10)
Put/size=16384,thread=20-16      24.64m ± 2%    14.59m ± 1%   -40.78% (p=0.000 n=10)
Put/size=16384,thread=100-16     33.66m ± 3%    17.60m ± 2%   -47.72% (p=0.000 n=10)
Put/size=16384,thread=200-16     43.27m ± 2%    35.42m ± 1%   -18.15% (p=0.000 n=10)
Put/size=65536,thread=1-16       10.84m ± 84%   13.11m ± 1%   ~ (p=0.353 n=10)
Put/size=65536,thread=20-16      28.73m ± 3%    16.55m ± 2%   -42.39% (p=0.000 n=10)
Put/size=65536,thread=100-16     50.44m ± 3%    23.58m ± 7%   -53.25% (p=0.000 n=10)
Put/size=65536,thread=200-16     73.42m ± 4%    42.20m ± 2%   -42.51% (p=0.000 n=10)
Put/size=262144,thread=1-16      23.70m ± 5%    13.67m ± 4%   -42.30% (p=0.000 n=10)
Put/size=262144,thread=20-16     39.23m ± 3%    18.69m ± 4%   -52.36% (p=0.000 n=10)
Put/size=262144,thread=100-16    112.04m ± 5%   68.14m ± 2%   -39.19% (p=0.000 n=10)
Put/size=262144,thread=200-16    192.9m ± 7%    113.5m ± 23%  -41.17% (p=0.000 n=10)
Put/size=1048576,thread=1-16                    17.17m ± 1%
Put/size=1048576,thread=20-16                   40.61m ± 18%
Put/size=1048576,thread=100-16                  197.9m ± 3%
Put/size=1048576,thread=200-16                  353.1m ± 4%
Put/size=4194304,thread=1-16                    20.74m ± 1%
Put/size=4194304,thread=20-16                   143.2m ± 9%
Put/size=4194304,thread=100-16                  700.5m ± 4%
Put/size=4194304,thread=200-16                  1.356 ± 10%
geomean                          23.62m         31.30m        -13.41%

                               │ ssd.peapod │ ssd.fstree-combined │
                               │    B/op    │   B/op     vs base  │
Put/size=1,thread=1-16           13.366Ki ± 0%   2.678Ki ± 1%  -79.96% (p=0.000 n=10)
Put/size=1,thread=20-16          119.02Ki ± 2%   42.40Ki ± 1%  -64.38% (p=0.000 n=10)
Put/size=1,thread=100-16         580.7Ki ± 1%    194.3Ki ± 0%  -66.54% (p=0.000 n=10)
Put/size=1,thread=200-16         1153.1Ki ± 27%  389.5Ki ± 0%  -66.22% (p=0.000 n=10)
Put/size=256,thread=1-16         9.384Ki ± 1%    2.703Ki ± 0%  -71.20% (p=0.000 n=10)
Put/size=256,thread=20-16        72.47Ki ± 3%    42.44Ki ± 0%  -41.45% (p=0.000 n=10)
Put/size=256,thread=100-16       305.2Ki ± 2%    195.0Ki ± 1%  -36.11% (p=0.000 n=10)
Put/size=256,thread=200-16       577.8Ki ± 1%    389.8Ki ± 1%  -32.54% (p=0.000 n=10)
Put/size=1024,thread=1-16        13.377Ki ± 1%   2.699Ki ± 0%  -79.83% (p=0.000 n=10)
Put/size=1024,thread=20-16       147.66Ki ± 8%   42.43Ki ± 1%  -71.26% (p=0.000 n=10)
Put/size=1024,thread=100-16      639.4Ki ± 1%    196.5Ki ± 1%  -69.27% (p=0.000 n=10)
Put/size=1024,thread=200-16      1250.3Ki ± 1%   392.4Ki ± 1%  -68.61% (p=0.000 n=10)
Put/size=4096,thread=1-16        32.826Ki ± 2%   2.696Ki ± 1%  -91.79% (p=0.000 n=10)
Put/size=4096,thread=20-16       484.87Ki ± 2%   42.28Ki ± 1%  -91.28% (p=0.000 n=10)
Put/size=4096,thread=100-16      2416.2Ki ± 3%   196.8Ki ± 0%  -91.85% (p=0.000 n=10)
Put/size=4096,thread=200-16      4659.8Ki ± 1%   392.8Ki ± 0%  -91.57% (p=0.000 n=10)
Put/size=16384,thread=1-16       90.928Ki ± 1%   2.707Ki ± 0%  -97.02% (p=0.000 n=10)
Put/size=16384,thread=20-16      1539.51Ki ± 1%  42.83Ki ± 1%  -97.22% (p=0.000 n=10)
Put/size=16384,thread=100-16     7535.5Ki ± 1%   198.0Ki ± 0%  -97.37% (p=0.000 n=10)
Put/size=16384,thread=200-16     14926.7Ki ± 1%  395.3Ki ± 0%  -97.35% (p=0.000 n=10)
Put/size=65536,thread=1-16       302.934Ki ± 1%  2.715Ki ± 0%  -99.10% (p=0.000 n=10)
Put/size=65536,thread=20-16      5398.51Ki ± 1%  43.22Ki ± 0%  -99.20% (p=0.000 n=10)
Put/size=65536,thread=100-16     26256.7Ki ± 1%  200.1Ki ± 1%  -99.24% (p=0.000 n=10)
Put/size=65536,thread=200-16     52366.5Ki ± 1%  400.0Ki ± 0%  -99.24% (p=0.000 n=10)
Put/size=262144,thread=1-16      1169.852Ki ± 2%  2.725Ki ± 1%  -99.77% (p=0.000 n=10)
Put/size=262144,thread=20-16     20540.44Ki ± 2%  43.49Ki ± 0%  -99.79% (p=0.000 n=10)
Put/size=262144,thread=100-16    99619.5Ki ± 2%   214.4Ki ± 1%  -99.78% (p=0.000 n=10)
Put/size=262144,thread=200-16    207670.7Ki ± 3%  422.7Ki ± 2%  -99.80% (p=0.000 n=10)
Put/size=1048576,thread=1-16                     2.770Ki ± 1%
Put/size=1048576,thread=20-16                    46.23Ki ± 5%
Put/size=1048576,thread=100-16                   229.8Ki ± 2%
Put/size=1048576,thread=200-16                   460.2Ki ± 1%
Put/size=4194304,thread=1-16                     2.771Ki ± 1%
Put/size=4194304,thread=20-16                    50.57Ki ± 1%
Put/size=4194304,thread=100-16                   257.8Ki ± 3%
Put/size=4194304,thread=200-16                   509.5Ki ± 11%
geomean                          984.0Ki         56.68Ki       -94.41%

                               │ ssd.peapod │ ssd.fstree-combined  │
                               │ allocs/op  │ allocs/op   vs base  │
Put/size=1,thread=1-16           55.00 ± 0%    32.00 ± 3%    -41.82% (p=0.000 n=10)
Put/size=1,thread=20-16          241.0 ± 1%    473.5 ± 1%    +96.47% (p=0.000 n=10)
Put/size=1,thread=100-16         1.214k ± 3%   2.167k ± 0%   +78.39% (p=0.000 n=10)
Put/size=1,thread=200-16         2.806k ± 20%  4.338k ± 0%   +54.62% (p=0.000 n=10)
Put/size=256,thread=1-16         58.00 ± 0%    32.00 ± 0%    -44.83% (p=0.000 n=10)
Put/size=256,thread=20-16        312.5 ± 3%    473.5 ± 0%    +51.52% (p=0.000 n=10)
Put/size=256,thread=100-16       1.607k ± 3%   2.163k ± 0%   +34.57% (p=0.000 n=10)
Put/size=256,thread=200-16       3.374k ± 2%   4.319k ± 0%   +28.01% (p=0.000 n=10)
Put/size=1024,thread=1-16        61.00 ± 0%    32.00 ± 0%    -47.54% (p=0.000 n=10)
Put/size=1024,thread=20-16       495.5 ± 14%   470.0 ± 1%    ~ (p=0.467 n=10)
Put/size=1024,thread=100-16      2.308k ± 2%   2.160k ± 0%   -6.41% (p=0.000 n=10)
Put/size=1024,thread=200-16      4.651k ± 3%   4.319k ± 0%   -7.12% (p=0.000 n=10)
Put/size=4096,thread=1-16        64.00 ± 0%    32.00 ± 0%    -50.00% (p=0.000 n=10)
Put/size=4096,thread=20-16       463.0 ± 3%    469.0 ± 0%    +1.30% (p=0.004 n=10)
Put/size=4096,thread=100-16      2.469k ± 7%   2.162k ± 0%   -12.42% (p=0.000 n=10)
Put/size=4096,thread=200-16      4.593k ± 1%   4.319k ± 0%   -5.98% (p=0.000 n=10)
Put/size=16384,thread=1-16       68.00 ± 1%    32.00 ± 0%    -52.94% (p=0.000 n=10)
Put/size=16384,thread=20-16      497.5 ± 1%    471.0 ± 0%    -5.33% (p=0.000 n=10)
Put/size=16384,thread=100-16     2.301k ± 1%   2.173k ± 0%   -5.58% (p=0.000 n=10)
Put/size=16384,thread=200-16     4.476k ± 2%   4.340k ± 0%   -3.04% (p=0.000 n=10)
Put/size=65536,thread=1-16       77.00 ± 1%    32.00 ± 0%    -58.44% (p=0.000 n=10)
Put/size=65536,thread=20-16      514.5 ± 1%    476.5 ± 1%    -7.39% (p=0.000 n=10)
Put/size=65536,thread=100-16     2.261k ± 2%   2.197k ± 1%   -2.81% (p=0.000 n=10)
Put/size=65536,thread=200-16     4.367k ± 2%   4.389k ± 0%   ~ (p=0.342 n=10)
Put/size=262144,thread=1-16      86.00 ± 1%    32.00 ± 0%    -62.79% (p=0.000 n=10)
Put/size=262144,thread=20-16     556.5 ± 4%    478.5 ± 0%    -14.02% (p=0.000 n=10)
Put/size=262144,thread=100-16    2.231k ± 2%   2.354k ± 1%   +5.54% (p=0.000 n=10)
Put/size=262144,thread=200-16    4.123k ± 2%   4.641k ± 2%   +12.54% (p=0.000 n=10)
Put/size=1048576,thread=1-16                   33.00 ± 3%
Put/size=1048576,thread=20-16                  509.5 ± 2%
Put/size=1048576,thread=100-16                 2.535k ± 1%
Put/size=1048576,thread=200-16                 5.062k ± 1%
Put/size=4194304,thread=1-16                   33.00 ± 0%
Put/size=4194304,thread=20-16                  573.0 ± 1%
Put/size=4194304,thread=100-16                 2.914k ± 3%
Put/size=4194304,thread=200-16                 5.767k ± 10%
geomean                          688.6         638.2         -10.30%

Signed-off-by: Roman Khimov
---
 .../blobstor/fstree/control.go                |  10 +-
 .../blobstor/fstree/fstree.go                 |  83 +++++++-
 .../blobstor/fstree/fstree_write_generic.go   |  11 +-
 .../blobstor/fstree/fstree_write_linux.go     | 195 ++++++++++++++++--
 .../blobstor/fstree/fstree_write_specific.go  |   2 +-
 5 files changed, 271 insertions(+), 30 deletions(-)

diff --git a/pkg/local_object_storage/blobstor/fstree/control.go b/pkg/local_object_storage/blobstor/fstree/control.go
index dedf334752..243151d9f6 100644
--- a/pkg/local_object_storage/blobstor/fstree/control.go
+++ b/pkg/local_object_storage/blobstor/fstree/control.go
@@ -19,13 +19,15 @@ func (t *FSTree) Init() error {
 		return fmt.Errorf("mkdir all for %q: %w", t.RootPath, err)
 	}
 	if !t.readOnly {
-		f := newSpecificWriteData(t.RootPath, t.Permissions, t.noSync)
-		if f != nil {
-			t.writeData = f
+		var w = newSpecificWriter(t.RootPath, t.Permissions, t.noSync)
+		if w != nil {
+			t.writer = w
 		}
 	}
 	return nil
 }

 // Close implements common.Storage.
-func (*FSTree) Close() error { return nil }
+func (t *FSTree) Close() error {
+	return t.writer.finalize()
+}

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go
index 5c24756ede..84158ced3b 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree.go
@@ -1,7 +1,9 @@
 package fstree

 import (
+	"bytes"
 	"crypto/sha256"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
@@ -28,7 +30,7 @@ type FSTree struct {
 	*compression.Config
 	Depth      uint64
 	DirNameLen int
-	writeData  func(string, []byte) error
+	writer     writer

 	noSync   bool
 	readOnly bool
@@ -43,11 +45,23 @@ type Info struct {
 	RootPath string
 }

+// writer is an internal FS writing interface.
+type writer interface {
+	writeData(oid.ID, string, []byte) error
+	finalize() error
+}
+
 const (
 	// DirNameLen is how many bytes is used to group keys into directories.
 	DirNameLen = 1 // in bytes
 	// MaxDepth is maximum depth of nested directories.
 	MaxDepth = (sha256.Size - 1) / DirNameLen
+
+	// combinedPrefix is the prefix that Protobuf message can't start with,
+	// it reads as "field number 15 of type 7", but there is no type 7 in
+	// the system (and we usually don't have 15 fields). ZSTD magic is also
+	// different.
+	combinedPrefix = 0x7f
 )

 var _ common.Storage = (*FSTree)(nil)
@@ -65,7 +79,7 @@ func New(opts ...Option) *FSTree {
 	for i := range opts {
 		opts[i](f)
 	}
-	f.writeData = newGenericWriteData(f.Permissions, f.noSync)
+	f.writer = newGenericWriter(f.Permissions, f.noSync)

 	return f
 }
@@ -141,11 +155,13 @@ func (t *FSTree) iterate(depth uint64, curPath []string, prm common.IteratePrm)
 			if prm.LazyHandler != nil {
 				err = prm.LazyHandler(*addr, func() ([]byte, error) {
 					data, err := os.ReadFile(filepath.Join(curPath...))
-					if err != nil && errors.Is(err, fs.ErrNotExist) {
-						return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
+					if err != nil {
+						if errors.Is(err, fs.ErrNotExist) {
+							return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
+						}
+						return nil, err
 					}
-
-					return data, err
+					return extractCombinedObject(addr.Object(), data)
 				})
 			} else {
 				var data []byte
@@ -155,7 +171,10 @@ func (t *FSTree) iterate(depth uint64, curPath []string, prm common.IteratePrm)
 					continue
 				}
 				if err == nil {
-					data, err = t.Decompress(data)
+					data, err = extractCombinedObject(addr.Object(), data)
+					if err == nil {
+						data, err = t.Decompress(data)
+					}
 				}
 				if err != nil {
 					if prm.IgnoreErrors {
@@ -266,7 +285,7 @@ func (t *FSTree) Put(prm common.PutPrm) (common.PutRes, error) {
 	if !prm.DontCompress {
 		prm.RawData = t.Compress(prm.RawData)
 	}

-	err := t.writeData(p, prm.RawData)
+	err := t.writer.writeData(prm.Address.Object(), p, prm.RawData)
 	if err != nil {
 		return common.PutRes{}, fmt.Errorf("write object data into file %q: %w", p, err)
 	}
@@ -285,7 +304,13 @@ func (t *FSTree) Get(prm common.GetPrm) (common.GetRes, error) {
 	if err != nil {
 		return common.GetRes{}, fmt.Errorf("read file %q: %w", p, err)
 	}
-
+	data, err = extractCombinedObject(prm.Address.Object(), data)
+	if err != nil {
+		if errors.Is(err, fs.ErrNotExist) {
+			return common.GetRes{}, logicerr.Wrap(apistatus.ObjectNotFound{})
+		}
+		return common.GetRes{}, fmt.Errorf("extract object from %q: %w", p, err)
+	}
 	data, err = t.Decompress(data)
 	if err != nil {
 		return common.GetRes{}, fmt.Errorf("decompress file data %q: %w", p, err)
@@ -334,6 +359,14 @@ func (t *FSTree) GetBytes(addr oid.Address) ([]byte, error) {
 		return nil, fmt.Errorf("read all %d bytes from object file %q: %w", sz, p, err)
 	}

+	b, err = extractCombinedObject(addr.Object(), b)
+	if err != nil {
+		if errors.Is(err, fs.ErrNotExist) {
+			return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
+		}
+		return nil, fmt.Errorf("extract object from %q: %w", p, err)
+	}
+
 	if !t.IsCompressed(b) {
 		return b, nil
 	}
@@ -346,6 +379,38 @@ func (t *FSTree) GetBytes(addr oid.Address) ([]byte, error) {
 	return dec, nil
 }

+func extractCombinedObject(id oid.ID, data []byte) ([]byte, error) {
+	const (
+		prefixSize = 1
+		idSize     = sha256.Size
+		lengthSize = 4
+
+		idOff     = prefixSize
+		lengthOff = idOff + idSize
+		dataOff   = lengthOff + lengthSize
+	)
+
+	var notFound bool
+
+	for len(data) > dataOff && data[0] == combinedPrefix {
+		notFound = true // The file _is_ combined, so the object _must_ be there.
+		var l = binary.BigEndian.Uint32(data[lengthOff:dataOff])
+		if bytes.Equal(data[idOff:lengthOff], id[:]) {
+			data = data[dataOff : dataOff+int(l)]
+			notFound = false
+			break
+		}
+		if len(data) < dataOff+int(l) {
+			break
+		}
+		data = data[dataOff+int(l):]
+	}
+	if notFound {
+		return nil, fs.ErrNotExist // Quite similar in meaning.
+	}
+	return data, nil
+}
+
 // GetRange implements common.Storage.
 func (t *FSTree) GetRange(prm common.GetRangePrm) (common.GetRangeRes, error) {
 	res, err := t.Get(common.GetPrm{Address: prm.Address})

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go
index 06bd1de587..8c85b1c671 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go
@@ -9,6 +9,7 @@ import (
 	"syscall"

 	"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common"
+	oid "github.com/nspcc-dev/neofs-sdk-go/object/id"
 )

 type genericWriter struct {
 	perm  fs.FileMode
 	flags int
 }

-func newGenericWriteData(perm fs.FileMode, noSync bool) func(string, []byte) error {
+func newGenericWriter(perm fs.FileMode, noSync bool) writer {
 	flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC | os.O_EXCL
 	if !noSync {
 		flags |= os.O_SYNC
@@ -25,10 +26,14 @@
 		perm:  perm,
 		flags: flags,
 	}
-	return w.writeData
+	return w
 }

-func (w *genericWriter) writeData(p string, data []byte) error {
+func (w *genericWriter) finalize() error {
+	return nil
+}
+
+func (w *genericWriter) writeData(_ oid.ID, p string, data []byte) error {
 	// Here is a situation:
 	// Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.161Z info log/log.go:13 local object storage operation {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "address": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "op": "PUT", "type": "fstree", "storage_id": ""}
 	// Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.183Z info log/log.go:13 local object storage operation {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "address": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "op": "metabase PUT"}

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go
index 98ae286c56..42f4a22e0f 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go
@@ -3,23 +3,52 @@
 package fstree

 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io/fs"
 	"strconv"
+	"sync"
+	"time"

 	"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common"
+	oid "github.com/nspcc-dev/neofs-sdk-go/object/id"
 	"golang.org/x/sys/unix"
 )

+const (
+	defaultTick        = 10 * time.Millisecond
+	combinedSizeThresh = 128 * 1024
+	combinedSizeLimit  = 8 * 1024 * 1024
+	combinedCountLimit = 128
+)
+
 type linuxWriter struct {
-	root  string
-	perm  uint32
-	flags int
+	root   string
+	perm   uint32
+	flags  int
+	bFlags int
+	noSync bool
+
+	batchLock sync.Mutex
+	batch     *syncBatch
 }

+type syncBatch struct {
+	lock     sync.Mutex
+	fd       int
+	procname string
+	cnt      int
+	size     int
+	noSync   bool
+	timer    *time.Timer
+	ready    chan struct{}
+	err      error
+}
+
-func newSpecificWriteData(root string, perm fs.FileMode, noSync bool) func(string, []byte) error {
+func newSpecificWriter(root string, perm fs.FileMode, noSync bool) writer {
 	flags := unix.O_WRONLY | unix.O_TMPFILE | unix.O_CLOEXEC
+	bFlags := flags
 	if !noSync {
 		flags |= unix.O_DSYNC
 	}
@@ -29,19 +58,159 @@
 	}
 	_ = unix.Close(fd) // Don't care about error.
 	w := &linuxWriter{
-		root:  root,
-		perm:  uint32(perm),
-		flags: flags,
+		root:   root,
+		perm:   uint32(perm),
+		flags:  flags,
+		bFlags: bFlags,
+		noSync: noSync,
+	}
+	return w
+}
+
+func (w *linuxWriter) newSyncBatch() (*syncBatch, error) {
+	fd, err := unix.Open(w.root, w.bFlags, w.perm)
+	if err != nil {
+		return nil, err
+	}
+	sb := &syncBatch{
+		fd:       fd,
+		procname: "/proc/self/fd/" + strconv.FormatUint(uint64(fd), 10),
+		ready:    make(chan struct{}),
+		noSync:   w.noSync,
+	}
+	sb.lock.Lock()
+	sb.timer = time.AfterFunc(defaultTick, sb.sync)
+	return sb, nil
+}
+
+func (b *syncBatch) sync() {
+	b.lock.Lock()
+	defer b.lock.Unlock()
+
+	select {
+	case <-b.ready:
+		return
+	default:
 	}
-	return w.writeData
+	b.intSync()
 }

-func (w *linuxWriter) writeData(p string, data []byte) error {
-	err := w.writeFile(p, data)
-	if errors.Is(err, unix.ENOSPC) {
-		return common.ErrNoSpace
+func (b *syncBatch) intSync() {
+	var err error
+
+	if b.err == nil && !b.noSync {
+		err = unix.Fdatasync(b.fd)
+		if err != nil {
+			b.err = err
+		}
+	}
+
+	err = unix.Close(b.fd)
+	if b.err == nil && err != nil {
+		b.err = err
+	}
+	close(b.ready)
+	_ = b.timer.Stop() // True is stopped, but false is "AfterFunc already running".
+}
+
+func (b *syncBatch) wait() error {
+	<-b.ready
+	return b.err
+}
+
+func (b *syncBatch) write(id oid.ID, p string, data []byte) error {
+	var (
+		err  error
+		pref [1 + len(id) + 4]byte
+	)
+	pref[0] = combinedPrefix
+	copy(pref[1:], id[:])
+	binary.BigEndian.PutUint32(pref[1+len(id):], uint32(len(data)))
+
+	n, err := unix.Writev(b.fd, [][]byte{pref[:], data})
+	if err != nil {
+		b.err = err
+		b.intSync()
+		return err
+	}
+	if n != len(pref)+len(data) {
+		b.err = errors.New("incomplete write")
+		b.intSync()
+		return b.err
+	}
+	b.size += n
+	b.cnt++
+	err = unix.Linkat(unix.AT_FDCWD, b.procname, unix.AT_FDCWD, p, unix.AT_SYMLINK_FOLLOW)
+	if err != nil {
+		if errors.Is(err, unix.EEXIST) {
+			// https://github.com/nspcc-dev/neofs-node/issues/2563
+			return nil
+		}
+		b.err = err
+		b.intSync()
+		return b.err
+	}
+	return nil
+}
+
+func (w *linuxWriter) finalize() error {
+	w.batchLock.Lock()
+	defer w.batchLock.Unlock()
+	if w.batch != nil {
+		w.batch.sync()
+		w.batch = nil
+	}
+	return nil
+}
+
+func (w *linuxWriter) writeData(id oid.ID, p string, data []byte) error {
+	var err error
+	if len(data) > combinedSizeThresh {
+		err = w.writeFile(p, data)
+	} else {
+		err = w.writeCombinedFile(id, p, data)
+	}
+	if err != nil {
+		if errors.Is(err, unix.ENOSPC) {
+			return common.ErrNoSpace
+		}
+		return err
+	}
+	return nil
+}
+
+func (w *linuxWriter) writeCombinedFile(id oid.ID, p string, data []byte) error {
+	var err error
+	var sb *syncBatch
+
+	w.batchLock.Lock()
+	if w.batch == nil {
+		w.batch, err = w.newSyncBatch()
+		sb = w.batch
+	} else {
+		sb = w.batch
+		sb.lock.Lock()
+		select {
+		case <-sb.ready:
+			sb.lock.Unlock()
+			w.batch, err = w.newSyncBatch()
+			sb = w.batch
+		default:
+		}
+	}
+	if err != nil {
+		return err
 	}
-	return err
+	err = sb.write(id, p, data)
+	if err == nil && sb.cnt >= combinedCountLimit || sb.size >= combinedSizeLimit {
+		sb.intSync()
+	}
+	sb.lock.Unlock()
+	w.batchLock.Unlock()
+	if err != nil {
+		return err
+	}
+	return sb.wait()
 }

 func (w *linuxWriter) writeFile(p string, data []byte) error {

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go
index 3950d94ab9..b8f2cf1e4d 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go
@@ -6,6 +6,6 @@ import (
 	"io/fs"
 )

-func newSpecificWriteData(_ string, _ fs.FileMode, _ bool) func(string, []byte) error {
+func newSpecificWriter(_ string, _ fs.FileMode, _ bool) writer {
 	return nil
 }
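The core Linux trick the writer above relies on can be shown in isolation:
open an unnamed file with O_TMPFILE, write to it, then give it one or more
names via linkat() through /proc/self/fd. Here is a minimal standalone
sketch of that pattern (directory path and file names are hypothetical,
error handling trimmed to panics):

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	dir := "/tmp/fstree-demo" // hypothetical target directory
	_ = unix.Mkdir(dir, 0o755)

	// Create an anonymous file inside dir; it has no name yet.
	fd, err := unix.Open(dir, unix.O_WRONLY|unix.O_TMPFILE|unix.O_CLOEXEC, 0o644)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	_, _ = unix.Write(fd, []byte("combined payload"))

	// Materialize the file under two names: both become hard links to
	// the same inode, which is how one combined file can back many
	// object paths, and why the data survives until the last link is
	// deleted.
	proc := fmt.Sprintf("/proc/self/fd/%d", fd)
	for _, name := range []string{dir + "/obj-a", dir + "/obj-b"} {
		if err := unix.Linkat(unix.AT_FDCWD, proc, unix.AT_FDCWD, name, unix.AT_SYMLINK_FOLLOW); err != nil {
			panic(err)
		}
	}
}
```

This also explains the Unix-only caveat in the commit message: the whole
scheme depends on hard-link semantics and /proc, so only the Linux writer
implements it while other platforms keep the generic one-file-per-object
path.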
From 9fc8cb430b2d41e092781532d591b20995756602 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Thu, 18 Apr 2024 23:55:19 +0300
Subject: [PATCH 03/12] fstree: don't do useless Stat() for Get

Open() (and ReadFile()) returns the exact same error, so there is no need
to Stat() first.

Signed-off-by: Roman Khimov
---
 pkg/local_object_storage/blobstor/fstree/fstree.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go
index 84158ced3b..c86e090330 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree.go
@@ -296,12 +296,11 @@ func (t *FSTree) Put(prm common.PutPrm) (common.PutRes, error) {
 func (t *FSTree) Get(prm common.GetPrm) (common.GetRes, error) {
 	p := t.treePath(prm.Address)

-	if _, err := os.Stat(p); errors.Is(err, fs.ErrNotExist) {
-		return common.GetRes{}, logicerr.Wrap(apistatus.ObjectNotFound{})
-	}
-
 	data, err := os.ReadFile(p)
 	if err != nil {
+		if errors.Is(err, fs.ErrNotExist) {
+			return common.GetRes{}, logicerr.Wrap(apistatus.ObjectNotFound{})
+		}
 		return common.GetRes{}, fmt.Errorf("read file %q: %w", p, err)
 	}
 	data, err = extractCombinedObject(prm.Address.Object(), data)

From d99e49027173ac7e5354e471e6a6e00f3b6a2fc8 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Fri, 19 Apr 2024 00:00:24 +0300
Subject: [PATCH 04/12] fstree: simplify GetBytes

We use ReadFile() for Get and it works just fine there, so there is no
reason for it not to work here as well. Large files could overwhelm us
either way (but a proper storage is not supposed to contain them).

Signed-off-by: Roman Khimov
---
 .../blobstor/fstree/fstree.go | 28 ++-----------------
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go
index c86e090330..927af10e9c 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree.go
@@ -6,9 +6,7 @@ import (
 	"encoding/binary"
 	"errors"
 	"fmt"
-	"io"
 	"io/fs"
-	"math"
 	"os"
 	"path/filepath"
 	"strings"
@@ -329,35 +327,13 @@ func (t *FSTree) Get(prm common.GetPrm) (common.GetRes, error) {
 func (t *FSTree) GetBytes(addr oid.Address) ([]byte, error) {
 	p := t.treePath(addr)

-	f, err := os.Open(p)
+	b, err := os.ReadFile(p)
 	if err != nil {
 		if errors.Is(err, fs.ErrNotExist) {
 			return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
 		}
-		return nil, fmt.Errorf("open object file %q: %w", p, err)
+		return nil, fmt.Errorf("read file %q: %w", p, err)
 	}
-
-	fi, err := f.Stat()
-	if err != nil {
-		return nil, fmt.Errorf("stat object file %q: %w", p, err)
-	}
-	sz := fi.Size()
-	if sz > math.MaxInt {
-		return nil, fmt.Errorf("too big object file %d > %d", sz, math.MaxInt)
-	}
-	if sz == 0 {
-		return nil, nil
-	}
-
-	b := make([]byte, sz)
-	_, err = io.ReadFull(f, b)
-	if err != nil {
-		if errors.Is(err, io.EOF) {
-			err = io.ErrUnexpectedEOF
-		}
-		return nil, fmt.Errorf("read all %d bytes from object file %q: %w", sz, p, err)
-	}
-
 	b, err = extractCombinedObject(addr.Object(), b)
 	if err != nil {
 		if errors.Is(err, fs.ErrNotExist) {

From 56ab649d9d962392cf75a086e54ba382888cc69b Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Fri, 19 Apr 2024 00:04:38 +0300
Subject: [PATCH 05/12] fstree: simplify compression handling in GetBytes()

IsCompressed() is already handled internally by Decompress(); duplicating
the check doesn't make our code better.

Signed-off-by: Roman Khimov
---
 pkg/local_object_storage/blobstor/fstree/fstree.go | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go
index 927af10e9c..f2750509e9 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree.go
@@ -342,11 +342,7 @@ func (t *FSTree) GetBytes(addr oid.Address) ([]byte, error) {
 		return nil, fmt.Errorf("extract object from %q: %w", p, err)
 	}

-	if !t.IsCompressed(b) {
-		return b, nil
-	}
-
-	dec, err := t.DecompressForce(b)
+	dec, err := t.Decompress(b)
 	if err != nil {
 		return nil, fmt.Errorf("decompress object file data %q: %w", p, err)
 	}
From 0d9ac779ecc79a935297f0a192f031e1c66a9d51 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Fri, 19 Apr 2024 00:21:15 +0300
Subject: [PATCH 06/12] fstree: deduplicate Get/GetBytes/Iterate code

Signed-off-by: Roman Khimov
---
 .../blobstor/fstree/fstree.go | 61 ++++++++-----------
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go
index f2750509e9..3b5f2c4915 100644
--- a/pkg/local_object_storage/blobstor/fstree/fstree.go
+++ b/pkg/local_object_storage/blobstor/fstree/fstree.go
@@ -152,14 +152,7 @@ func (t *FSTree) iterate(depth uint64, curPath []string, prm common.IteratePrm)
 			if prm.LazyHandler != nil {
 				err = prm.LazyHandler(*addr, func() ([]byte, error) {
-					data, err := os.ReadFile(filepath.Join(curPath...))
-					if err != nil {
-						if errors.Is(err, fs.ErrNotExist) {
-							return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
-						}
-						return nil, err
-					}
-					return extractCombinedObject(addr.Object(), data)
+					return getRawObjectBytes(addr.Object(), filepath.Join(curPath...))
 				})
 			} else {
 				var data []byte
@@ -292,30 +285,14 @@ func (t *FSTree) Put(prm common.PutPrm) (common.PutRes, error) {
 // Get returns an object from the storage by address.
 func (t *FSTree) Get(prm common.GetPrm) (common.GetRes, error) {
-	p := t.treePath(prm.Address)
-
-	data, err := os.ReadFile(p)
+	data, err := t.getObjBytes(prm.Address)
 	if err != nil {
-		if errors.Is(err, fs.ErrNotExist) {
-			return common.GetRes{}, logicerr.Wrap(apistatus.ObjectNotFound{})
-		}
-		return common.GetRes{}, fmt.Errorf("read file %q: %w", p, err)
-	}
-	data, err = extractCombinedObject(prm.Address.Object(), data)
-	if err != nil {
-		if errors.Is(err, fs.ErrNotExist) {
-			return common.GetRes{}, logicerr.Wrap(apistatus.ObjectNotFound{})
-		}
-		return common.GetRes{}, fmt.Errorf("extract object from %q: %w", p, err)
-	}
-	data, err = t.Decompress(data)
-	if err != nil {
-		return common.GetRes{}, fmt.Errorf("decompress file data %q: %w", p, err)
+		return common.GetRes{}, err
 	}

 	obj := objectSDK.New()
 	if err := obj.Unmarshal(data); err != nil {
-		return common.GetRes{}, fmt.Errorf("decode object from file %q: %w", p, err)
+		return common.GetRes{}, fmt.Errorf("decode object: %w", err)
 	}

 	return common.GetRes{Object: obj, RawData: data}, nil
@@ -325,29 +302,41 @@
 // canonical NeoFS binary format. Returns [apistatus.ObjectNotFound] if object
 // is missing.
 func (t *FSTree) GetBytes(addr oid.Address) ([]byte, error) {
+	return t.getObjBytes(addr)
+}
+
+// getObjBytes extracts object bytes from the storage by address.
+func (t *FSTree) getObjBytes(addr oid.Address) ([]byte, error) {
 	p := t.treePath(addr)
+	data, err := getRawObjectBytes(addr.Object(), p)
+	if err != nil {
+		return nil, err
+	}
+	data, err = t.Decompress(data)
+	if err != nil {
+		return nil, fmt.Errorf("decompress file data %q: %w", p, err)
+	}
+	return data, nil
+}

-	b, err := os.ReadFile(p)
+// getRawObjectBytes extracts raw object bytes from the storage by path. No
+// decompression is performed.
+func getRawObjectBytes(id oid.ID, p string) ([]byte, error) {
+	data, err := os.ReadFile(p)
 	if err != nil {
 		if errors.Is(err, fs.ErrNotExist) {
 			return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
 		}
 		return nil, fmt.Errorf("read file %q: %w", p, err)
 	}
-	b, err = extractCombinedObject(addr.Object(), b)
+	data, err = extractCombinedObject(id, data)
 	if err != nil {
 		if errors.Is(err, fs.ErrNotExist) {
 			return nil, logicerr.Wrap(apistatus.ObjectNotFound{})
 		}
 		return nil, fmt.Errorf("extract object from %q: %w", p, err)
 	}
-
-	dec, err := t.Decompress(b)
-	if err != nil {
-		return nil, fmt.Errorf("decompress object file data %q: %w", p, err)
-	}
-
-	return dec, nil
+	return data, nil
 }

 func extractCombinedObject(id oid.ID, data []byte) ([]byte, error) {

From 68dfb3065b05dc0a683a142ddd5186fb5ee936c5 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Fri, 26 Apr 2024 12:26:24 +0300
Subject: [PATCH 07/12] docs: drop obsolete parameters from blobstor config
 example

Peapod has no depth/width, FSTree has no width/size/opened_cache_capacity.

Signed-off-by: Roman Khimov
---
 docs/storage-node-configuration.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docs/storage-node-configuration.md b/docs/storage-node-configuration.md
index 51c2c0b2cf..911f6b1a21 100644
--- a/docs/storage-node-configuration.md
+++ b/docs/storage-node-configuration.md
@@ -180,15 +180,10 @@ Currently only 2 types are supported: `fstree` and `peapod`.
 blobstor:
   - type: peapod
     path: /path/to/peapod.db
-    depth: 1
-    width: 4
   - type: fstree
     path: /path/to/blobstor
     perm: 0644
-    size: 4194304
     depth: 1
-    width: 4
-    opened_cache_capacity: 50
 ```

 #### Common options for sub-storages

From 56ace14e53225f7b894cbf2e7f414a3c6d4e7462 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Fri, 26 Apr 2024 12:29:24 +0300
Subject: [PATCH 08/12] docs: add no_sync doc for fstree

Signed-off-by: Roman Khimov
---
 docs/storage-node-configuration.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/storage-node-configuration.md b/docs/storage-node-configuration.md
index 911f6b1a21..dfeafb8153 100644
--- a/docs/storage-node-configuration.md
+++ b/docs/storage-node-configuration.md
@@ -198,6 +198,7 @@ blobstor:
 | `path`    | `string`  |               | Path to the root of the blobstor.                      |
 | `perm`    | file mode | `0640`        | Default permission for created files and directories.  |
 | `depth`   | `int`     | `4`           | File-system tree depth.                                |
+| `no_sync` | `bool`    | `false`       | Disable write synchronization; makes writes faster but can lead to data loss. |
| #### `peapod` type options | Parameter | Type | Default value | Description | From d8183e43663fb9badb78c00bd400e69a23e66624 Mon Sep 17 00:00:00 2001 From: Roman Khimov Date: Fri, 26 Apr 2024 14:53:50 +0300 Subject: [PATCH 09/12] fstree: make combined writer configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't need to combine for SSDs: goos: linux goarch: amd64 pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics │ ssd.fstree-original │ ssd.fstree-combined │ │ sec/op │ sec/op vs base │ Put/size=1,thread=1-16 91.98µ ± 2% 14040.16µ ± 1% +15164.94% (p=0.000 n=10) Put/size=1,thread=20-16 1.010m ± 1% 15.095m ± 2% +1393.89% (p=0.000 n=10) Put/size=1,thread=100-16 5.272m ± 1% 17.306m ± 3% +228.27% (p=0.000 n=10) Put/size=1,thread=200-16 10.77m ± 1% 33.77m ± 2% +213.52% (p=0.000 n=10) Put/size=256,thread=1-16 91.37µ ± 2% 14018.07µ ± 1% +15242.09% (p=0.000 n=10) Put/size=256,thread=20-16 1.028m ± 4% 15.583m ± 2% +1415.81% (p=0.000 n=10) Put/size=256,thread=100-16 5.420m ± 1% 16.869m ± 6% +211.22% (p=0.000 n=10) Put/size=256,thread=200-16 12.42m ± 11% 29.55m ± 2% +137.89% (p=0.000 n=10) Put/size=1024,thread=1-16 85.38µ ± 7% 12301.06µ ± 0% +14306.75% (p=0.000 n=10) Put/size=1024,thread=20-16 1.053m ± 4% 13.855m ± 2% +1216.07% (p=0.000 n=10) Put/size=1024,thread=100-16 5.629m ± 1% 15.694m ± 2% +178.82% (p=0.000 n=10) Put/size=1024,thread=200-16 12.55m ± 9% 30.10m ± 1% +139.83% (p=0.000 n=10) Put/size=4096,thread=1-16 150.6µ ± 6% 12864.7µ ± 1% +8441.32% (p=0.000 n=10) Put/size=4096,thread=20-16 1.308m ± 1% 14.136m ± 2% +980.81% (p=0.000 n=10) Put/size=4096,thread=100-16 6.670m ± 2% 16.101m ± 2% +141.40% (p=0.000 n=10) Put/size=4096,thread=200-16 13.45m ± 1% 31.72m ± 2% +135.88% (p=0.000 n=10) Put/size=16384,thread=1-16 181.7µ ± 5% 12891.6µ ± 1% +6994.13% (p=0.000 n=18+10) Put/size=16384,thread=20-16 1.131m ± 4% 14.592m ± 1% +1189.98% (p=0.000 n=10) Put/size=16384,thread=100-16 6.060m ± 31% 17.599m ± 2% +190.41% (p=0.000 n=10) Put/size=16384,thread=200-16 12.38m ± 2% 35.42m ± 1% +185.99% (p=0.000 n=10) Put/size=65536,thread=1-16 282.8µ ± 5% 13109.3µ ± 1% +4535.47% (p=0.000 n=10) Put/size=65536,thread=20-16 1.599m ± 10% 16.549m ± 2% +935.19% (p=0.000 n=10) Put/size=65536,thread=100-16 8.472m ± 4% 23.581m ± 7% +178.34% (p=0.000 n=10) Put/size=65536,thread=200-16 16.60m ± 2% 42.20m ± 2% +154.30% (p=0.000 n=10) Put/size=262144,thread=1-16 577.9µ ± 4% 13673.9µ ± 4% +2266.11% (p=0.000 n=10) Put/size=262144,thread=20-16 3.961m ± 30% 18.688m ± 4% +371.76% (p=0.000 n=10) Put/size=262144,thread=100-16 20.09m ± 36% 68.14m ± 2% +239.16% (p=0.000 n=10) Put/size=262144,thread=200-16 38.53m ± 33% 113.45m ± 23% +194.48% (p=0.000 n=10) Put/size=1048576,thread=1-16 1.766m ± 5% 17.167m ± 1% +871.86% (p=0.000 n=10) Put/size=1048576,thread=20-16 15.71m ± 21% 40.61m ± 18% +158.54% (p=0.000 n=10) Put/size=1048576,thread=100-16 61.86m ± 25% 197.93m ± 3% +219.98% (p=0.000 n=10) Put/size=1048576,thread=200-16 123.7m ± 18% 353.1m ± 4% +185.47% (p=0.000 n=10) Put/size=4194304,thread=1-16 5.485m ± 2% 20.737m ± 1% +278.07% (p=0.000 n=10) Put/size=4194304,thread=20-16 51.55m ± 16% 143.20m ± 9% +177.79% (p=0.000 n=10) Put/size=4194304,thread=100-16 260.4m ± 20% 700.5m ± 4% +169.04% (p=0.000 n=10) Put/size=4194304,thread=200-16 521.9m ± 11% 1356.2m ± 10% +159.86% (p=0.000 n=10) geomean 4.278m 31.30m +631.70% │ ssd.fstree-original │ ssd.fstree-combined │ │ B/op │ B/op vs base │ Put/size=1,thread=1-16 1.904Ki ± 
0% 2.678Ki ± 1% +40.68% (p=0.000 n=10) Put/size=1,thread=20-16 36.94Ki ± 0% 42.40Ki ± 1% +14.76% (p=0.000 n=10) Put/size=1,thread=100-16 186.9Ki ± 0% 194.3Ki ± 0% +3.99% (p=0.000 n=10) Put/size=1,thread=200-16 370.9Ki ± 1% 389.5Ki ± 0% +5.01% (p=0.000 n=10) Put/size=256,thread=1-16 1.917Ki ± 0% 2.703Ki ± 0% +40.98% (p=0.000 n=10) Put/size=256,thread=20-16 37.51Ki ± 0% 42.44Ki ± 0% +13.14% (p=0.000 n=10) Put/size=256,thread=100-16 188.3Ki ± 1% 195.0Ki ± 1% +3.53% (p=0.000 n=10) Put/size=256,thread=200-16 378.5Ki ± 1% 389.8Ki ± 1% +3.00% (p=0.000 n=10) Put/size=1024,thread=1-16 1.914Ki ± 0% 2.699Ki ± 0% +40.99% (p=0.000 n=10) Put/size=1024,thread=20-16 37.57Ki ± 1% 42.43Ki ± 1% +12.95% (p=0.000 n=10) Put/size=1024,thread=100-16 190.3Ki ± 1% 196.5Ki ± 1% +3.28% (p=0.000 n=10) Put/size=1024,thread=200-16 380.9Ki ± 0% 392.4Ki ± 1% +3.02% (p=0.000 n=10) Put/size=4096,thread=1-16 1.952Ki ± 0% 2.696Ki ± 1% +38.12% (p=0.000 n=10) Put/size=4096,thread=20-16 38.12Ki ± 1% 42.28Ki ± 1% +10.91% (p=0.000 n=10) Put/size=4096,thread=100-16 191.5Ki ± 1% 196.8Ki ± 0% +2.81% (p=0.000 n=10) Put/size=4096,thread=200-16 382.1Ki ± 1% 392.8Ki ± 0% +2.80% (p=0.000 n=10) Put/size=16384,thread=1-16 1.977Ki ± 1% 2.707Ki ± 0% +36.90% (p=0.000 n=18+10) Put/size=16384,thread=20-16 38.00Ki ± 0% 42.83Ki ± 1% +12.71% (p=0.000 n=10) Put/size=16384,thread=100-16 191.1Ki ± 0% 198.0Ki ± 0% +3.63% (p=0.000 n=10) Put/size=16384,thread=200-16 382.2Ki ± 0% 395.3Ki ± 0% +3.43% (p=0.000 n=10) Put/size=65536,thread=1-16 2.020Ki ± 1% 2.715Ki ± 0% +34.40% (p=0.000 n=10) Put/size=65536,thread=20-16 38.42Ki ± 1% 43.22Ki ± 0% +12.51% (p=0.000 n=10) Put/size=65536,thread=100-16 193.1Ki ± 0% 200.1Ki ± 1% +3.64% (p=0.000 n=10) Put/size=65536,thread=200-16 386.1Ki ± 0% 400.0Ki ± 0% +3.61% (p=0.000 n=10) Put/size=262144,thread=1-16 2.119Ki ± 0% 2.725Ki ± 1% +28.57% (p=0.000 n=10) Put/size=262144,thread=20-16 39.47Ki ± 1% 43.49Ki ± 0% +10.18% (p=0.000 n=10) Put/size=262144,thread=100-16 197.9Ki ± 1% 214.4Ki ± 1% +8.31% (p=0.000 n=10) Put/size=262144,thread=200-16 395.9Ki ± 1% 422.7Ki ± 2% +6.77% (p=0.000 n=10) Put/size=1048576,thread=1-16 2.243Ki ± 0% 2.770Ki ± 1% +23.51% (p=0.000 n=10) Put/size=1048576,thread=20-16 42.91Ki ± 3% 46.23Ki ± 5% +7.73% (p=0.000 n=10) Put/size=1048576,thread=100-16 210.3Ki ± 2% 229.8Ki ± 2% +9.27% (p=0.000 n=10) Put/size=1048576,thread=200-16 427.9Ki ± 2% 460.2Ki ± 1% +7.55% (p=0.000 n=10) Put/size=4194304,thread=1-16 2.325Ki ± 1% 2.771Ki ± 1% +19.17% (p=0.000 n=10) Put/size=4194304,thread=20-16 45.24Ki ± 2% 50.57Ki ± 1% +11.79% (p=0.000 n=10) Put/size=4194304,thread=100-16 231.7Ki ± 2% 257.8Ki ± 3% +11.26% (p=0.000 n=10) Put/size=4194304,thread=200-16 474.7Ki ± 4% 509.5Ki ± 11% +7.33% (p=0.000 n=10) geomean 50.02Ki 56.68Ki +13.32% │ ssd.fstree-original │ ssd.fstree-combined │ │ allocs/op │ allocs/op vs base │ Put/size=1,thread=1-16 22.00 ± 0% 32.00 ± 3% +45.45% (p=0.000 n=10) Put/size=1,thread=20-16 413.0 ± 0% 473.5 ± 1% +14.65% (p=0.000 n=10) Put/size=1,thread=100-16 2.069k ± 0% 2.167k ± 0% +4.71% (p=0.000 n=10) Put/size=1,thread=200-16 4.137k ± 0% 4.338k ± 0% +4.87% (p=0.000 n=10) Put/size=256,thread=1-16 22.00 ± 0% 32.00 ± 0% +45.45% (p=0.000 n=10) Put/size=256,thread=20-16 415.0 ± 0% 473.5 ± 0% +14.10% (p=0.000 n=10) Put/size=256,thread=100-16 2.070k ± 0% 2.163k ± 0% +4.47% (p=0.000 n=10) Put/size=256,thread=200-16 4.160k ± 0% 4.319k ± 0% +3.83% (p=0.000 n=10) Put/size=1024,thread=1-16 22.00 ± 5% 32.00 ± 0% +45.45% (p=0.000 n=10) Put/size=1024,thread=20-16 413.0 ± 1% 470.0 ± 1% +13.80% (p=0.000 n=10) 
Put/size=1024,thread=100-16 2.074k ± 0% 2.160k ± 0% +4.15% (p=0.000 n=10) Put/size=1024,thread=200-16 4.155k ± 0% 4.319k ± 0% +3.96% (p=0.000 n=10) Put/size=4096,thread=1-16 22.00 ± 0% 32.00 ± 0% +45.45% (p=0.000 n=10) Put/size=4096,thread=20-16 419.0 ± 0% 469.0 ± 0% +11.93% (p=0.000 n=10) Put/size=4096,thread=100-16 2.088k ± 0% 2.162k ± 0% +3.54% (p=0.000 n=10) Put/size=4096,thread=200-16 4.173k ± 0% 4.319k ± 0% +3.50% (p=0.000 n=10) Put/size=16384,thread=1-16 22.00 ± 0% 32.00 ± 0% +45.45% (p=0.000 n=18+10) Put/size=16384,thread=20-16 414.0 ± 0% 471.0 ± 0% +13.77% (p=0.000 n=10) Put/size=16384,thread=100-16 2.082k ± 1% 2.173k ± 0% +4.37% (p=0.000 n=10) Put/size=16384,thread=200-16 4.162k ± 0% 4.340k ± 0% +4.29% (p=0.000 n=10) Put/size=65536,thread=1-16 23.00 ± 4% 32.00 ± 0% +39.13% (p=0.000 n=10) Put/size=65536,thread=20-16 420.0 ± 0% 476.5 ± 1% +13.45% (p=0.000 n=10) Put/size=65536,thread=100-16 2.103k ± 0% 2.197k ± 1% +4.49% (p=0.000 n=10) Put/size=65536,thread=200-16 4.204k ± 0% 4.389k ± 0% +4.41% (p=0.000 n=10) Put/size=262144,thread=1-16 23.00 ± 0% 32.00 ± 0% +39.13% (p=0.000 n=10) Put/size=262144,thread=20-16 430.0 ± 1% 478.5 ± 0% +11.28% (p=0.000 n=10) Put/size=262144,thread=100-16 2.162k ± 2% 2.354k ± 1% +8.91% (p=0.000 n=10) Put/size=262144,thread=200-16 4.362k ± 2% 4.641k ± 2% +6.38% (p=0.000 n=10) Put/size=1048576,thread=1-16 25.00 ± 0% 33.00 ± 3% +32.00% (p=0.000 n=10) Put/size=1048576,thread=20-16 464.5 ± 3% 509.5 ± 2% +9.69% (p=0.000 n=10) Put/size=1048576,thread=100-16 2.322k ± 2% 2.535k ± 1% +9.17% (p=0.000 n=10) Put/size=1048576,thread=200-16 4.760k ± 1% 5.062k ± 1% +6.36% (p=0.000 n=10) Put/size=4194304,thread=1-16 26.00 ± 4% 33.00 ± 0% +26.92% (p=0.000 n=10) Put/size=4194304,thread=20-16 491.0 ± 2% 573.0 ± 1% +16.70% (p=0.000 n=10) Put/size=4194304,thread=100-16 2.550k ± 2% 2.914k ± 3% +14.29% (p=0.000 n=10) Put/size=4194304,thread=200-16 5.254k ± 4% 5.767k ± 10% +9.76% (p=0.000 n=10) geomean 552.5 638.2 +15.50% This makes flush_interval common to peapod and fstree, since the two settings mean almost the same thing, and it leaves no peapod-specific configuration options at all. 
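As a minimal sketch of wiring these knobs in code (the option functions are the ones added by this patch; the path and the tuning values are hypothetical, mirroring the node.yaml example further below):

    package main

    import (
        "time"

        "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/fstree"
    )

    func main() {
        // Combined writes tuned for an HDD; WithCombinedCountLimit(0) or (1)
        // would disable combining, which is the recommended setup for SSDs.
        fst := fstree.New(
            fstree.WithPath("/srv/neofs/blob"), // hypothetical path
            fstree.WithCombinedCountLimit(64),
            fstree.WithCombinedSizeLimit(16*1024*1024),
            fstree.WithCombinedSizeThreshold(512*1024),
            fstree.WithCombinedWriteInterval(20*time.Millisecond),
        )
        _ = fst.Open(false) // read-write mode
        _ = fst.Init()      // Init picks the combined writer on Linux
    }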
Signed-off-by: Roman Khimov --- CHANGELOG.md | 1 + cmd/neofs-lens/internal/storage/root.go | 14 +++-- cmd/neofs-lens/internal/storage/sanity.go | 4 +- cmd/neofs-node/config.go | 8 +-- cmd/neofs-node/config/engine/config_test.go | 7 +-- .../engine/shard/blobstor/fstree/config.go | 53 ++++++++++++++++++- .../engine/shard/blobstor/peapod/config.go | 34 ------------ .../engine/shard/blobstor/storage/config.go | 23 +++++++- cmd/neofs-node/storage.go | 6 ++- cmd/neofs-node/storage/config.go | 17 +++--- config/example/node.json | 6 ++- config/example/node.yaml | 4 ++ docs/storage-node-configuration.md | 17 +++--- .../blobstor/fstree/control.go | 2 +- .../blobstor/fstree/fstree.go | 11 ++++ .../blobstor/fstree/fstree_write_linux.go | 36 +++++++------ .../blobstor/fstree/fstree_write_specific.go | 6 +-- .../blobstor/fstree/option.go | 25 +++++++++ 18 files changed, 181 insertions(+), 93 deletions(-) delete mode 100644 cmd/neofs-node/config/engine/shard/blobstor/peapod/config.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bf0697479..8bde550f92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Changelog for NeoFS Node ## [Unreleased] ### Added +- More effective FSTree writer for HDDs, new configuration options for it (#2814) ### Fixed diff --git a/cmd/neofs-lens/internal/storage/root.go b/cmd/neofs-lens/internal/storage/root.go index b4a715d211..8101cc99b2 100644 --- a/cmd/neofs-lens/internal/storage/root.go +++ b/cmd/neofs-lens/internal/storage/root.go @@ -9,7 +9,6 @@ import ( engineconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine" shardconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard" fstreeconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/fstree" - peapodconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/peapod" "github.com/nspcc-dev/neofs-node/cmd/neofs-node/storage" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/fstree" @@ -119,15 +118,18 @@ func openEngine(cmd *cobra.Command) *engine.StorageEngine { sCfg.Typ = storagesCfg[i].Type() sCfg.Path = storagesCfg[i].Path() sCfg.Perm = storagesCfg[i].Perm() + sCfg.FlushInterval = storagesCfg[i].FlushInterval() switch storagesCfg[i].Type() { case fstree.Type: sub := fstreeconfig.From((*config.Config)(storagesCfg[i])) sCfg.Depth = sub.Depth() sCfg.NoSync = sub.NoSync() + sCfg.CombinedCountLimit = sub.CombinedCountLimit() + sCfg.CombinedSizeLimit = sub.CombinedSizeLimit() + sCfg.CombinedSizeThreshold = sub.CombinedSizeThreshold() case peapod.Type: - peapodCfg := peapodconfig.From((*config.Config)(storagesCfg[i])) - sCfg.FlushInterval = peapodCfg.FlushInterval() + // Nothing peapod-specific, but it should work. default: return fmt.Errorf("can't initiate storage. 
invalid storage type: %s", storagesCfg[i].Type()) } @@ -193,7 +195,11 @@ func openEngine(cmd *cobra.Command) *engine.StorageEngine { fstree.WithPath(sRead.Path), fstree.WithPerm(sRead.Perm), fstree.WithDepth(sRead.Depth), - fstree.WithNoSync(sRead.NoSync)), + fstree.WithNoSync(sRead.NoSync), + fstree.WithCombinedCountLimit(sRead.CombinedCountLimit), + fstree.WithCombinedSizeLimit(sRead.CombinedSizeLimit), + fstree.WithCombinedSizeThreshold(sRead.CombinedSizeThreshold), + fstree.WithCombinedWriteInterval(sRead.FlushInterval)), Policy: func(_ *objectSDK.Object, data []byte) bool { return true }, diff --git a/cmd/neofs-lens/internal/storage/sanity.go b/cmd/neofs-lens/internal/storage/sanity.go index 53c0b63246..3209fecbd7 100644 --- a/cmd/neofs-lens/internal/storage/sanity.go +++ b/cmd/neofs-lens/internal/storage/sanity.go @@ -13,7 +13,6 @@ import ( engineconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine" shardconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard" fstreeconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/fstree" - peapodconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/peapod" objectcore "github.com/nspcc-dev/neofs-node/pkg/core/object" commonb "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/compression" @@ -77,8 +76,7 @@ func sanityCheck(cmd *cobra.Command, _ []string) { default: return fmt.Errorf("unsupported sub-storage type '%s'", subCfg.Type()) case peapod.Type: - peapodCfg := peapodconfig.From((*config.Config)(subCfg)) - sh.p = peapod.New(subCfg.Path(), subCfg.Perm(), peapodCfg.FlushInterval()) + sh.p = peapod.New(subCfg.Path(), subCfg.Perm(), subCfg.FlushInterval()) var compressCfg compression.Config err := compressCfg.Init() diff --git a/cmd/neofs-node/config.go b/cmd/neofs-node/config.go index e6de08ec90..d603482cc1 100644 --- a/cmd/neofs-node/config.go +++ b/cmd/neofs-node/config.go @@ -23,7 +23,6 @@ import ( engineconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine" shardconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard" fstreeconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/fstree" - peapodconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/peapod" loggerconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/logger" metricsconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/metrics" morphconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/morph" @@ -224,15 +223,18 @@ func (a *applicationConfiguration) readConfig(c *config.Config) error { sCfg.Typ = storagesCfg[i].Type() sCfg.Path = storagesCfg[i].Path() sCfg.Perm = storagesCfg[i].Perm() + sCfg.FlushInterval = storagesCfg[i].FlushInterval() switch storagesCfg[i].Type() { case fstree.Type: sub := fstreeconfig.From((*config.Config)(storagesCfg[i])) sCfg.Depth = sub.Depth() sCfg.NoSync = sub.NoSync() + sCfg.CombinedCountLimit = sub.CombinedCountLimit() + sCfg.CombinedSizeLimit = sub.CombinedSizeLimit() + sCfg.CombinedSizeThreshold = sub.CombinedSizeThreshold() case peapod.Type: - peapodCfg := peapodconfig.From((*config.Config)(storagesCfg[i])) - sCfg.FlushInterval = peapodCfg.FlushInterval() + // No specific configs, but it's a valid storage type. 
default: return fmt.Errorf("invalid storage type: %s", storagesCfg[i].Type()) } diff --git a/cmd/neofs-node/config/engine/config_test.go b/cmd/neofs-node/config/engine/config_test.go index 160f455d4f..de3b66a6aa 100644 --- a/cmd/neofs-node/config/engine/config_test.go +++ b/cmd/neofs-node/config/engine/config_test.go @@ -9,7 +9,6 @@ import ( engineconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine" shardconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard" fstreeconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/fstree" - peapodconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/blobstor/peapod" piloramaconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/engine/shard/pilorama" configtest "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/test" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/peapod" @@ -87,11 +86,10 @@ func TestEngineSection(t *testing.T) { require.EqualValues(t, 102400, sc.SmallSizeLimit()) require.Equal(t, 2, len(ss)) - ppd := peapodconfig.From((*config.Config)(ss[0])) require.Equal(t, "tmp/0/blob/peapod.db", ss[0].Path()) require.EqualValues(t, 0644, ss[0].Perm()) require.EqualValues(t, peapod.Type, ss[0].Type()) - require.EqualValues(t, 10*time.Millisecond, ppd.FlushInterval()) + require.EqualValues(t, 10*time.Millisecond, ss[0].FlushInterval()) require.Equal(t, "tmp/0/blob", ss[1].Path()) require.EqualValues(t, 0644, ss[1].Perm()) @@ -131,11 +129,10 @@ func TestEngineSection(t *testing.T) { require.EqualValues(t, 102400, sc.SmallSizeLimit()) require.Equal(t, 2, len(ss)) - ppd := peapodconfig.From((*config.Config)(ss[0])) require.Equal(t, "tmp/1/blob/peapod.db", ss[0].Path()) require.EqualValues(t, 0644, ss[0].Perm()) require.EqualValues(t, peapod.Type, ss[0].Type()) - require.EqualValues(t, 30*time.Millisecond, ppd.FlushInterval()) + require.EqualValues(t, 30*time.Millisecond, ss[0].FlushInterval()) require.Equal(t, "tmp/1/blob", ss[1].Path()) require.EqualValues(t, 0644, ss[1].Perm()) diff --git a/cmd/neofs-node/config/engine/shard/blobstor/fstree/config.go b/cmd/neofs-node/config/engine/shard/blobstor/fstree/config.go index 6595e9375d..43c2de6d82 100644 --- a/cmd/neofs-node/config/engine/shard/blobstor/fstree/config.go +++ b/cmd/neofs-node/config/engine/shard/blobstor/fstree/config.go @@ -1,16 +1,27 @@ package fstree import ( + "math" + "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/fstree" + "github.com/spf13/cast" ) // Config is a wrapper over the config section // which provides access to FSTree configurations. type Config config.Config -// DepthDefault is a default shallow dir depth. -const DepthDefault = 4 +const ( + // DepthDefault is the default shallow dir depth. + DepthDefault = 4 + // CombinedCountLimitDefault is the default for the maximum number of objects to write into a single file. + CombinedCountLimitDefault = 128 + // CombinedSizeLimitDefault is the default for the maximum size of the combined object file. + CombinedSizeLimitDefault = 8 * 1024 * 1024 + // CombinedSizeThresholdDefault is the default for the minimal size of the object that won't be combined with others for writes. + CombinedSizeThresholdDefault = 128 * 1024 +) // From wraps config section into Config. 
func From(c *config.Config) *Config { @@ -45,3 +56,41 @@ func (x *Config) Depth() uint64 { func (x *Config) NoSync() bool { return config.BoolSafe((*config.Config)(x), "no_sync") } + +// CombinedCountLimit returns the value of "combined_count_limit" config parameter. +// +// Returns [CombinedCountLimitDefault] if the value is missing or not a positive integer. +func (x *Config) CombinedCountLimit() int { + var v = (*config.Config)(x).Value("combined_count_limit") + if v == nil { + return CombinedCountLimitDefault + } + + i, err := cast.ToIntE(v) + if err != nil { + return CombinedCountLimitDefault + } + return i +} + +// CombinedSizeLimit returns the value of "combined_size_limit" config parameter. +// +// Returns [CombinedSizeLimitDefault] if the value is missing, equal to 0 or not a proper size specification. +func (x *Config) CombinedSizeLimit() int { + var s = config.SizeInBytesSafe((*config.Config)(x), "combined_size_limit") + if s == 0 || s > math.MaxInt { + return CombinedSizeLimitDefault + } + return int(s) +} + +// CombinedSizeThreshold returns the value of "combined_size_threshold" config parameter. +// +// Returns [CombinedSizeThresholdDefault] if the value is missing, equal to 0 or not a proper size specification. +func (x *Config) CombinedSizeThreshold() int { + var s = config.SizeInBytesSafe((*config.Config)(x), "combined_size_threshold") + if s == 0 || s > math.MaxInt { + return CombinedSizeThresholdDefault + } + return int(s) +} diff --git a/cmd/neofs-node/config/engine/shard/blobstor/peapod/config.go b/cmd/neofs-node/config/engine/shard/blobstor/peapod/config.go deleted file mode 100644 index cd01d21ca9..0000000000 --- a/cmd/neofs-node/config/engine/shard/blobstor/peapod/config.go +++ /dev/null @@ -1,34 +0,0 @@ -package peapodconfig - -import ( - "time" - - "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config" -) - -// Config is a wrapper over the config section -// which provides access to Peapod configurations. -type Config config.Config - -// Various Peapod config defaults. -const ( - // DefaultFlushInterval is a default time interval between Peapod's batch writes - // to disk. - DefaultFlushInterval = 10 * time.Millisecond -) - -// From wraps config section into Config. -func From(c *config.Config) *Config { - return (*Config)(c) -} - -// FlushInterval returns the value of "flush_interval" config parameter. -// -// Returns DefaultFlushInterval if the value is not a positive duration. -func (x *Config) FlushInterval() time.Duration { - d := config.DurationSafe((*config.Config)(x), "flush_interval") - if d > 0 { - return d - } - return DefaultFlushInterval -} diff --git a/cmd/neofs-node/config/engine/shard/blobstor/storage/config.go b/cmd/neofs-node/config/engine/shard/blobstor/storage/config.go index 4a7d879e0d..32b78686a1 100644 --- a/cmd/neofs-node/config/engine/shard/blobstor/storage/config.go +++ b/cmd/neofs-node/config/engine/shard/blobstor/storage/config.go @@ -2,14 +2,22 @@ package storage import ( "io/fs" + "time" "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config" ) type Config config.Config -// PermDefault are default permission bits for BlobStor data. -const PermDefault = 0o640 +// Various config defaults. +const ( + // PermDefault are default permission bits for BlobStor data. + PermDefault = 0o640 + + // DefaultFlushInterval is the default time interval between Peapod's batch writes + // to disk. 
+ DefaultFlushInterval = 10 * time.Millisecond +) func From(x *config.Config) *Config { return (*Config)(x) @@ -53,3 +61,14 @@ func (x *Config) Perm() fs.FileMode { return fs.FileMode(p) } + +// FlushInterval returns the value of "flush_interval" config parameter. +// +// Returns DefaultFlushInterval if the value is not a positive duration. +func (x *Config) FlushInterval() time.Duration { + d := config.DurationSafe((*config.Config)(x), "flush_interval") + if d > 0 { + return d + } + return DefaultFlushInterval +} diff --git a/cmd/neofs-node/storage.go b/cmd/neofs-node/storage.go index f613281f58..13714fbfbb 100644 --- a/cmd/neofs-node/storage.go +++ b/cmd/neofs-node/storage.go @@ -153,7 +153,11 @@ func (c *cfg) shardOpts() []shardOptsWithID { fstree.WithPath(sRead.Path), fstree.WithPerm(sRead.Perm), fstree.WithDepth(sRead.Depth), - fstree.WithNoSync(sRead.NoSync)), + fstree.WithNoSync(sRead.NoSync), + fstree.WithCombinedCountLimit(sRead.CombinedCountLimit), + fstree.WithCombinedSizeLimit(sRead.CombinedSizeLimit), + fstree.WithCombinedSizeThreshold(sRead.CombinedSizeThreshold), + fstree.WithCombinedWriteInterval(sRead.FlushInterval)), Policy: func(_ *objectSDK.Object, data []byte) bool { return true }, diff --git a/cmd/neofs-node/storage/config.go b/cmd/neofs-node/storage/config.go index b4b69d7d49..8ed85196ac 100644 --- a/cmd/neofs-node/storage/config.go +++ b/cmd/neofs-node/storage/config.go @@ -53,16 +53,17 @@ type ShardCfg struct { } type SubStorageCfg struct { // common for all storages - Typ string - Path string - Perm fs.FileMode + Typ string + Path string + Perm fs.FileMode + FlushInterval time.Duration // tree-specific (FS) - Depth uint64 - NoSync bool - - // Peapod-specific - FlushInterval time.Duration + Depth uint64 + NoSync bool + CombinedCountLimit int + CombinedSizeLimit int + CombinedSizeThreshold int } // ID returns persistent id of a shard. 
It is different from the ID used in runtime diff --git a/config/example/node.json b/config/example/node.json index d07a7fd8e6..a0f3e786d5 100644 --- a/config/example/node.json +++ b/config/example/node.json @@ -199,7 +199,11 @@ "path": "tmp/1/blob", "no_sync": true, "perm": "0644", - "depth": 5 + "depth": 5, + "flush_interval": "20ms", + "combined_count_limit": 64, + "combined_size_limit": "16M", + "combined_size_threshold": "512K" } ], "pilorama": { diff --git a/config/example/node.yaml b/config/example/node.yaml index 9dad016afc..5b3b2d9ac9 100644 --- a/config/example/node.yaml +++ b/config/example/node.yaml @@ -196,6 +196,10 @@ storage: - type: fstree path: tmp/1/blob # blobstor path no_sync: true + flush_interval: 20ms # time interval between combined file writes to disk (defaults to 10ms) + combined_count_limit: 64 # number of small objects to write into a single file (defaults to 128) + combined_size_limit: 16M # limit for the multi-object file size (defaults to 8M) + combined_size_threshold: 512K # threshold for combined object writing (defaults to 128K) pilorama: path: tmp/1/blob/pilorama.db diff --git a/docs/storage-node-configuration.md b/docs/storage-node-configuration.md index dfeafb8153..d39ebeee55 100644 --- a/docs/storage-node-configuration.md +++ b/docs/storage-node-configuration.md @@ -191,21 +191,24 @@ blobstor: |-------------------------------------|-----------------------------------------------|---------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `path` | `string` | | Path to the root of the blobstor. | | `perm` | file mode | `0640` | Default permission for created files and directories. | +| `flush_interval` | `duration` | `10ms` | Time interval between batch writes to disk. | #### `fstree` type options -| Parameter | Type | Default value | Description | -|---------------------|-----------|---------------|-------------------------------------------------------| -| `path` | `string` | | Path to the root of the blobstor. | -| `perm` | file mode | `0640` | Default permission for created files and directories. | -| `depth` | `int` | `4` | File-system tree depth. | -| `no_sync` | `bool` | `false` | Disable write synchronization, makes writes faster, but can lead to data loss. | +| Parameter | Type | Default value | Description | +|---------------------------|-----------|---------------|------------------------------------------------------------------------------------------------------------------------------| +| `path` | `string` | | Path to the root of the blobstor. | +| `perm` | file mode | `0640` | Default permission for created files and directories. | +| `depth` | `int` | `4` | File-system tree depth. | +| `no_sync` | `bool` | `false` | Disable write synchronization, makes writes faster, but can lead to data loss. | +| `combined_count_limit` | `int` | `128` | Maximum number of objects to write into a single file, 0 or 1 disables combined writing (disabling is recommended for SSDs). | +| `combined_size_limit` | `size` | `8M` | Maximum size of a multi-object file. | +| `combined_size_threshold` | `size` | `128K` | Minimum size of object that won't be combined with others when writing to disk. 
| #### `peapod` type options | Parameter | Type | Default value | Description | |---------------------|-----------|---------------|-------------------------------------------------------| | `path` | `string` | | Path to the Peapod database file. | | `perm` | file mode | `0640` | Default permission for created files and directories. | -| `flush_interval` | `duration`| `10ms` | Time interval between batch writes to disk. | ### `gc` subsection diff --git a/pkg/local_object_storage/blobstor/fstree/control.go b/pkg/local_object_storage/blobstor/fstree/control.go index 243151d9f6..b6fc940042 100644 --- a/pkg/local_object_storage/blobstor/fstree/control.go +++ b/pkg/local_object_storage/blobstor/fstree/control.go @@ -19,7 +19,7 @@ func (t *FSTree) Init() error { return fmt.Errorf("mkdir all for %q: %w", t.RootPath, err) } if !t.readOnly { - var w = newSpecificWriter(t.RootPath, t.Permissions, t.noSync) + var w = newSpecificWriter(t) if w != nil { t.writer = w } diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go index 3b5f2c4915..325b4811b8 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree.go @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/compression" @@ -32,6 +33,11 @@ type FSTree struct { noSync bool readOnly bool + + combinedCountLimit int + combinedSizeLimit int + combinedSizeThreshold int + combinedWriteInterval time.Duration } // Info groups the information about file storage. @@ -73,6 +79,11 @@ func New(opts ...Option) *FSTree { Config: nil, Depth: 4, DirNameLen: DirNameLen, + + combinedCountLimit: 128, + combinedSizeLimit: 8 * 1024 * 1024, + combinedSizeThreshold: 128 * 1024, + combinedWriteInterval: 10 * time.Millisecond, } for i := range opts { opts[i](f) diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go index 42f4a22e0f..26acc76e54 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go @@ -6,7 +6,6 @@ import ( "encoding/binary" "errors" "fmt" - "io/fs" "strconv" "sync" "time" @@ -16,13 +15,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - defaultTick = 10 * time.Millisecond - combinedSizeThresh = 128 * 1024 - combinedSizeLimit = 8 * 1024 * 1024 - combinedCountLimit = 128 -) - type linuxWriter struct { root string perm uint32 @@ -30,6 +22,11 @@ type linuxWriter struct { bFlags int noSync bool + combinedCountLimit int + combinedSizeLimit int + combinedSizeThreshold int + combinedWriteInterval time.Duration + batchLock sync.Mutex batch *syncBatch } @@ -46,23 +43,28 @@ type syncBatch struct { err error } -func newSpecificWriter(root string, perm fs.FileMode, noSync bool) writer { +func newSpecificWriter(t *FSTree) writer { flags := unix.O_WRONLY | unix.O_TMPFILE | unix.O_CLOEXEC bFlags := flags - if !noSync { + if !t.noSync { flags |= unix.O_DSYNC } - fd, err := unix.Open(root, flags, uint32(perm)) + fd, err := unix.Open(t.RootPath, flags, uint32(t.Permissions)) if err != nil { return nil // Which means that OS-specific writeData can't be created and FSTree should use the generic one. } _ = unix.Close(fd) // Don't care about error. 
w := &linuxWriter{ - root: root, - perm: uint32(perm), + root: t.RootPath, + perm: uint32(t.Permissions), flags: flags, bFlags: bFlags, - noSync: noSync, + noSync: t.noSync, + + combinedCountLimit: t.combinedCountLimit, + combinedSizeLimit: t.combinedSizeLimit, + combinedSizeThreshold: t.combinedSizeThreshold, + combinedWriteInterval: t.combinedWriteInterval, } return w } @@ -79,7 +81,7 @@ func (w *linuxWriter) newSyncBatch() (*syncBatch, error) { noSync: w.noSync, } sb.lock.Lock() - sb.timer = time.AfterFunc(defaultTick, sb.sync) + sb.timer = time.AfterFunc(w.combinedWriteInterval, sb.sync) return sb, nil } @@ -165,7 +167,7 @@ func (w *linuxWriter) finalize() error { func (w *linuxWriter) writeData(id oid.ID, p string, data []byte) error { var err error - if len(data) > combinedSizeThresh { + if len(data) > w.combinedSizeThreshold || w.combinedCountLimit < 2 { err = w.writeFile(p, data) } else { err = w.writeCombinedFile(id, p, data) @@ -202,7 +204,7 @@ func (w *linuxWriter) writeCombinedFile(id oid.ID, p string, data []byte) error return err } err = sb.write(id, p, data) - if err == nil && sb.cnt >= w.combinedCountLimit || sb.size >= w.combinedSizeLimit { sb.intSync() } sb.lock.Unlock() diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go index b8f2cf1e4d..fc0ebf840f 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_specific.go @@ -2,10 +2,6 @@ package fstree -import ( - "io/fs" -) - -func newSpecificWriter(_ string, _ fs.FileMode, _ bool) writer { +func newSpecificWriter(_ *FSTree) writer { return nil } diff --git a/pkg/local_object_storage/blobstor/fstree/option.go b/pkg/local_object_storage/blobstor/fstree/option.go index 07e5474445..c16f7b190f 100644 --- a/pkg/local_object_storage/blobstor/fstree/option.go +++ b/pkg/local_object_storage/blobstor/fstree/option.go @@ -2,6 +2,7 @@ package fstree import ( "io/fs" + "time" ) type Option func(*FSTree) @@ -35,3 +36,27 @@ func WithNoSync(noSync bool) Option { f.noSync = noSync } } + +func WithCombinedCountLimit(limit int) Option { + return func(f *FSTree) { + f.combinedCountLimit = limit + } +} + +func WithCombinedSizeLimit(size int) Option { + return func(f *FSTree) { + f.combinedSizeLimit = size + } +} + +func WithCombinedSizeThreshold(size int) Option { + return func(f *FSTree) { + f.combinedSizeThreshold = size + } +} + +func WithCombinedWriteInterval(t time.Duration) Option { + return func(f *FSTree) { + f.combinedWriteInterval = t + } +} From 659c3b381e80599d23caaee1431d3b3f7c684373 Mon Sep 17 00:00:00 2001 From: Roman Khimov Date: Tue, 27 Aug 2024 22:19:22 +0300 Subject: [PATCH 10/12] blobstore: add minimalistic Get benchmark It tries to push data with 100 threads, which is close to the default maximum combining count. Then reads are done with as many threads as needed, trying to read various objects. Unfortunately, size is not a real object size here: it can't be, since one can push any garbage into the store with Put, but Get decodes its input and needs something that looks like a valid object. So size here is the payload size. 
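To make "looks like a valid object" concrete, this is roughly the shape of what each Put stores (a distilled sketch mirroring the setup code in the diff below; marshaledObject is an illustrative name, not part of the patch):

    package main

    import (
        cid "github.com/nspcc-dev/neofs-sdk-go/container/id"
        "github.com/nspcc-dev/neofs-sdk-go/object"
        oid "github.com/nspcc-dev/neofs-sdk-go/object/id"
    )

    // marshaledObject builds the minimal decodable object the benchmark stores;
    // "size" in the benchmark name refers to payloadSize, while the stored blob
    // is somewhat larger because of the object header fields.
    func marshaledObject(payloadSize int) []byte {
        obj := object.New()
        obj.SetID(oid.ID{1, 2, 3})
        obj.SetContainerID(cid.ID{1, 2, 3})
        obj.SetPayload(make([]byte, payloadSize))
        return obj.Marshal()
    }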
Signed-off-by: Roman Khimov --- .../blobstor/bench_test.go | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/pkg/local_object_storage/blobstor/bench_test.go b/pkg/local_object_storage/blobstor/bench_test.go index a8ce636dea..2c76c0d844 100644 --- a/pkg/local_object_storage/blobstor/bench_test.go +++ b/pkg/local_object_storage/blobstor/bench_test.go @@ -11,6 +11,9 @@ import ( "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/fstree" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/peapod" + cid "github.com/nspcc-dev/neofs-sdk-go/container/id" + "github.com/nspcc-dev/neofs-sdk-go/object" + oid "github.com/nspcc-dev/neofs-sdk-go/object/id" oidtest "github.com/nspcc-dev/neofs-sdk-go/object/id/test" "github.com/stretchr/testify/require" ) @@ -94,3 +97,86 @@ func BenchmarkPut(b *testing.B) { }) } } + +func BenchmarkGet(b *testing.B) { + const nObjects = 10000 + + for _, tc := range []struct { + objSize uint64 + nThreads int + }{ + {1, 1}, + {1, 20}, + {1, 100}, + {1 << 10, 1}, + {1 << 10, 20}, + {1 << 10, 100}, + {100 << 10, 1}, + {100 << 10, 20}, + {100 << 10, 100}, + } { + b.Run(fmt.Sprintf("size=%d,thread=%d", tc.objSize, tc.nThreads), func(b *testing.B) { + for name, creat := range map[string]func(testing.TB) common.Storage{ + "peapod": newTestPeapod, + "fstree": newTestFSTree, + } { + b.Run(name, func(b *testing.B) { + var objs = make([]oid.Address, 0, nObjects) + + ptt := creat(b) + require.NoError(b, ptt.Open(false)) + require.NoError(b, ptt.Init()) + b.Cleanup(func() { _ = ptt.Close() }) + + obj := object.New() + data := make([]byte, tc.objSize) + rand.Read(data) + obj.SetID(oid.ID{1, 2, 3}) + obj.SetContainerID(cid.ID{1, 2, 3}) + obj.SetPayload(data) + + prm := common.PutPrm{ + RawData: obj.Marshal(), + } + + var ach = make(chan oid.Address) + for i := 0; i < 100; i++ { + go func() { + for j := 0; j < nObjects/100; j++ { + prm := prm + + prm.Address = oidtest.Address() + + _, err := ptt.Put(prm) + require.NoError(b, err) + ach <- prm.Address + } + }() + } + for i := 0; i < nObjects; i++ { + a := <-ach + objs = append(objs, a) + } + + b.ResetTimer() + for n := 0; n < b.N; n++ { + var wg sync.WaitGroup + + for i := 0; i < tc.nThreads; i++ { + wg.Add(1) + go func(ind int) { + defer wg.Done() + + var prm = common.GetPrm{Address: objs[nObjects/tc.nThreads*ind+n%(nObjects/tc.nThreads)]} + _, err := ptt.Get(prm) + require.NoError(b, err) + }(i) + } + + wg.Wait() + } + }) + } + }) + } +} From 391cfb418de11a9f8b868df274d84e93025ed015 Mon Sep 17 00:00:00 2001 From: Roman Khimov Date: Tue, 27 Aug 2024 23:08:28 +0300 Subject: [PATCH 11/12] fstree: deduplicate iterate code further It does the same thing. Signed-off-by: Roman Khimov --- pkg/local_object_storage/blobstor/fstree/fstree.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go index 325b4811b8..7dac161e8f 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree.go @@ -168,15 +168,12 @@ func (t *FSTree) iterate(depth uint64, curPath []string, prm common.IteratePrm) } else { var data []byte p := filepath.Join(curPath...) 
- data, err = os.ReadFile(p) - if err != nil && errors.Is(err, fs.ErrNotExist) { + data, err = getRawObjectBytes(addr.Object(), p) + if err != nil && errors.Is(err, apistatus.ObjectNotFound{}) { continue } if err == nil { - data, err = extractCombinedObject(addr.Object(), data) - if err == nil { - data, err = t.Decompress(data) - } + data, err = t.Decompress(data) } if err != nil { if prm.IgnoreErrors { From 1b40ec4cbc6dd7a6475c064608317f791d30e51a Mon Sep 17 00:00:00 2001 From: Roman Khimov Date: Wed, 28 Aug 2024 16:00:31 +0300 Subject: [PATCH 12/12] fstree: minimize memory required for combined objects Keeping the whole combined file in memory is excessive. The tests below were run with the default write combining enabled. HDD, before the patch: goos: linux goarch: amd64 pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor cpu: AMD Ryzen 5 3600 6-Core Processor BenchmarkGet/size=1,thread=1/fstree-12 33870 35109 ns/op 14054 B/op 26 allocs/op BenchmarkGet/size=1,thread=1/peapod-12 104599 11292 ns/op 1496 B/op 27 allocs/op BenchmarkGet/size=1,thread=20/peapod-12 9690 123428 ns/op 30027 B/op 591 allocs/op BenchmarkGet/size=1,thread=20/fstree-12 6931 170086 ns/op 280829 B/op 501 allocs/op BenchmarkGet/size=1,thread=100/peapod-12 1978 586932 ns/op 148056 B/op 2683 allocs/op BenchmarkGet/size=1,thread=100/fstree-12 1828 656545 ns/op 1404125 B/op 2501 allocs/op BenchmarkGet/size=1024,thread=1/peapod-12 95288 12732 ns/op 3664 B/op 38 allocs/op BenchmarkGet/size=1024,thread=1/fstree-12 16780 71580 ns/op 117472 B/op 26 allocs/op BenchmarkGet/size=1024,thread=20/peapod-12 7676 147320 ns/op 73223 B/op 770 allocs/op BenchmarkGet/size=1024,thread=20/fstree-12 2733 440093 ns/op 2349173 B/op 501 allocs/op BenchmarkGet/size=1024,thread=100/peapod-12 1666 711513 ns/op 366425 B/op 3877 allocs/op BenchmarkGet/size=1024,thread=100/fstree-12 439 2773494 ns/op 11746542 B/op 2510 allocs/op BenchmarkGet/size=102400,thread=1/peapod-12 15387 82956 ns/op 214422 B/op 30 allocs/op BenchmarkGet/size=102400,thread=1/fstree-12 898 1494701 ns/op 6583348 B/op 26 allocs/op BenchmarkGet/size=102400,thread=20/peapod-12 1015 1056701 ns/op 4287961 B/op 564 allocs/op BenchmarkGet/size=102400,thread=20/fstree-12 57 18260165 ns/op 124299921 B/op 501 allocs/op BenchmarkGet/size=102400,thread=100/peapod-12 199 6087298 ns/op 21442817 B/op 3066 allocs/op BenchmarkGet/size=102400,thread=100/fstree-12 12 91612104 ns/op 630612319 B/op 2501 allocs/op PASS ok github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor 282.428s HDD, after the patch: goos: linux goarch: amd64 pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor cpu: AMD Ryzen 5 3600 6-Core Processor BenchmarkGet/size=1,thread=1/fstree-12 13465 91189 ns/op 1687 B/op 26 allocs/op BenchmarkGet/size=1,thread=1/peapod-12 107473 11447 ns/op 1516 B/op 30 allocs/op BenchmarkGet/size=1,thread=20/peapod-12 9819 122212 ns/op 29637 B/op 542 allocs/op BenchmarkGet/size=1,thread=20/fstree-12 4321 276909 ns/op 33456 B/op 501 allocs/op BenchmarkGet/size=1,thread=100/peapod-12 2032 582728 ns/op 147916 B/op 2666 allocs/op BenchmarkGet/size=1,thread=100/fstree-12 1224 983660 ns/op 167230 B/op 2501 allocs/op BenchmarkGet/size=1024,thread=1/peapod-12 93832 12741 ns/op 3676 B/op 39 allocs/op BenchmarkGet/size=1024,thread=1/fstree-12 11612 97948 ns/op 3776 B/op 26 allocs/op BenchmarkGet/size=1024,thread=20/peapod-12 8413 145797 ns/op 73244 B/op 772 allocs/op BenchmarkGet/size=1024,thread=20/fstree-12 4113 288727 ns/op 75216 B/op 501 allocs/op 
BenchmarkGet/size=1024,thread=100/peapod-12 1677 688383 ns/op 366354 B/op 3866 allocs/op BenchmarkGet/size=1024,thread=100/fstree-12 1131 1054335 ns/op 376035 B/op 2501 allocs/op BenchmarkGet/size=102400,thread=1/peapod-12 16148 79871 ns/op 214414 B/op 29 allocs/op BenchmarkGet/size=102400,thread=1/fstree-12 6381 183071 ns/op 214592 B/op 26 allocs/op BenchmarkGet/size=102400,thread=20/peapod-12 1029 1163020 ns/op 4288312 B/op 608 allocs/op BenchmarkGet/size=102400,thread=20/fstree-12 1545 774230 ns/op 4291601 B/op 501 allocs/op BenchmarkGet/size=102400,thread=100/peapod-12 204 5881155 ns/op 21448378 B/op 3629 allocs/op BenchmarkGet/size=102400,thread=100/fstree-12 286 4097583 ns/op 21458835 B/op 2513 allocs/op SSD, before the patch: goos: linux goarch: amd64 pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics BenchmarkGet/size=1,thread=1/peapod-16 206061 5598 ns/op 1550 B/op 34 allocs/op BenchmarkGet/size=1,thread=1/fstree-16 43969 24736 ns/op 14056 B/op 26 allocs/op BenchmarkGet/size=1,thread=20/peapod-16 8701 130963 ns/op 30784 B/op 684 allocs/op BenchmarkGet/size=1,thread=20/fstree-16 10915 109432 ns/op 280837 B/op 501 allocs/op BenchmarkGet/size=1,thread=100/peapod-16 1615 697317 ns/op 153837 B/op 3395 allocs/op BenchmarkGet/size=1,thread=100/fstree-16 2515 459902 ns/op 1404191 B/op 2502 allocs/op BenchmarkGet/size=1024,thread=1/peapod-16 135268 7757 ns/op 3658 B/op 37 allocs/op BenchmarkGet/size=1024,thread=1/fstree-16 24434 47873 ns/op 117472 B/op 26 allocs/op BenchmarkGet/size=1024,thread=20/peapod-16 8172 148323 ns/op 73263 B/op 773 allocs/op BenchmarkGet/size=1024,thread=20/fstree-16 4244 243463 ns/op 2349182 B/op 501 allocs/op BenchmarkGet/size=1024,thread=100/peapod-16 1370 843700 ns/op 366623 B/op 3873 allocs/op BenchmarkGet/size=1024,thread=100/fstree-16 606 1917891 ns/op 11746920 B/op 2514 allocs/op BenchmarkGet/size=102400,thread=1/fstree-16 769 1667689 ns/op 8439914 B/op 26 allocs/op BenchmarkGet/size=102400,thread=1/peapod-16 26174 41144 ns/op 214476 B/op 36 allocs/op BenchmarkGet/size=102400,thread=20/fstree-16 49 21872222 ns/op 169093205 B/op 501 allocs/op BenchmarkGet/size=102400,thread=20/peapod-16 1674 636594 ns/op 4289612 B/op 758 allocs/op BenchmarkGet/size=102400,thread=100/peapod-16 296 3582319 ns/op 21447448 B/op 3535 allocs/op BenchmarkGet/size=102400,thread=100/fstree-16 10 110982517 ns/op 841263811 B/op 2505 allocs/op PASS ok github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor 203.115s SSD, after the patch: goos: linux goarch: amd64 pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics BenchmarkGet/size=1,thread=1/fstree-16 26210 44577 ns/op 1687 B/op 26 allocs/op BenchmarkGet/size=1,thread=1/peapod-16 203316 5551 ns/op 1550 B/op 34 allocs/op BenchmarkGet/size=1,thread=20/peapod-16 8142 127892 ns/op 30745 B/op 679 allocs/op BenchmarkGet/size=1,thread=20/fstree-16 8155 141522 ns/op 33457 B/op 501 allocs/op BenchmarkGet/size=1,thread=100/fstree-16 2160 526853 ns/op 167233 B/op 2501 allocs/op BenchmarkGet/size=1,thread=100/peapod-16 1633 695693 ns/op 153747 B/op 3383 allocs/op BenchmarkGet/size=1024,thread=1/fstree-16 24176 51102 ns/op 3776 B/op 26 allocs/op BenchmarkGet/size=1024,thread=1/peapod-16 139054 7587 ns/op 3675 B/op 39 allocs/op BenchmarkGet/size=1024,thread=20/peapod-16 8098 145081 ns/op 72656 B/op 697 allocs/op BenchmarkGet/size=1024,thread=20/fstree-16 7908 152477 ns/op 75216 B/op 501 allocs/op 
BenchmarkGet/size=1024,thread=100/peapod-16 1339 807344 ns/op 365303 B/op 3704 allocs/op BenchmarkGet/size=1024,thread=100/fstree-16 1930 578930 ns/op 376032 B/op 2501 allocs/op BenchmarkGet/size=102400,thread=1/peapod-16 22363 49137 ns/op 214402 B/op 28 allocs/op BenchmarkGet/size=102400,thread=1/fstree-16 8257 143084 ns/op 214592 B/op 26 allocs/op BenchmarkGet/size=102400,thread=20/peapod-16 1596 654267 ns/op 4289593 B/op 753 allocs/op BenchmarkGet/size=102400,thread=20/fstree-16 2222 507259 ns/op 4291623 B/op 501 allocs/op BenchmarkGet/size=102400,thread=100/peapod-16 286 3655491 ns/op 21447877 B/op 3592 allocs/op BenchmarkGet/size=102400,thread=100/fstree-16 415 2938832 ns/op 21459193 B/op 2517 allocs/op PASS ok github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor 213.832s Notice that peapod degrades more when multi-threaded. It still wins for super-small objects with a low thread count, but that's the simplest scenario, and this case can be improved for FSTree as well. Also, SSDs work better without write combining, as we know from previous results; non-combined reads on SSDs look like this: goos: linux goarch: amd64 pkg: github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics BenchmarkGet/size=1,thread=1/peapod-16 150696 6881 ns/op 1552 B/op 35 allocs/op BenchmarkGet/size=1,thread=1/fstree-16 60586 17133 ns/op 2279 B/op 26 allocs/op BenchmarkGet/size=1,thread=20/peapod-16 9412 127615 ns/op 30750 B/op 679 allocs/op BenchmarkGet/size=1,thread=20/fstree-16 19616 60384 ns/op 45298 B/op 501 allocs/op BenchmarkGet/size=1,thread=100/fstree-16 5278 203212 ns/op 226452 B/op 2501 allocs/op BenchmarkGet/size=1,thread=100/peapod-16 1600 691861 ns/op 153755 B/op 3385 allocs/op BenchmarkGet/size=1024,thread=1/peapod-16 148694 8222 ns/op 3677 B/op 39 allocs/op BenchmarkGet/size=1024,thread=1/fstree-16 55369 19654 ns/op 3936 B/op 26 allocs/op BenchmarkGet/size=1024,thread=20/peapod-16 7117 157147 ns/op 73252 B/op 772 allocs/op BenchmarkGet/size=1024,thread=20/fstree-16 17496 71552 ns/op 78417 B/op 501 allocs/op BenchmarkGet/size=1024,thread=100/peapod-16 1346 815546 ns/op 364150 B/op 3564 allocs/op BenchmarkGet/size=1024,thread=100/fstree-16 4941 242115 ns/op 392055 B/op 2501 allocs/op BenchmarkGet/size=102400,thread=1/peapod-16 23816 43648 ns/op 214439 B/op 31 allocs/op BenchmarkGet/size=102400,thread=1/fstree-16 16125 72096 ns/op 214752 B/op 26 allocs/op BenchmarkGet/size=102400,thread=20/peapod-16 1748 658507 ns/op 4288086 B/op 580 allocs/op BenchmarkGet/size=102400,thread=20/fstree-16 1838 617794 ns/op 4294815 B/op 501 allocs/op BenchmarkGet/size=102400,thread=100/peapod-16 284 3732700 ns/op 21441371 B/op 2795 allocs/op BenchmarkGet/size=102400,thread=100/fstree-16 438 2749439 ns/op 21475282 B/op 2518 allocs/op PASS ok github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor 172.643s Signed-off-by: Roman Khimov --- .../blobstor/fstree/fstree.go | 69 ++++++++++++++----- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go index 7dac161e8f..1593af9511 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree.go @@ -6,7 +6,9 @@ import ( "encoding/binary" "errors" "fmt" + "io" "io/fs" + "math" "os" "path/filepath" "strings" @@ -330,14 +332,15 @@ func (t *FSTree) getObjBytes(addr oid.Address) ([]byte, error) { // getRawObjectBytes extracts raw object 
bytes from the storage by path. No // decompression is performed. func getRawObjectBytes(id oid.ID, p string) ([]byte, error) { - data, err := os.ReadFile(p) + f, err := os.Open(p) if err != nil { if errors.Is(err, fs.ErrNotExist) { return nil, logicerr.Wrap(apistatus.ObjectNotFound{}) } return nil, fmt.Errorf("read file %q: %w", p, err) } - data, err = extractCombinedObject(id, data) + defer f.Close() + data, err := extractCombinedObject(id, f) if err != nil { if errors.Is(err, fs.ErrNotExist) { return nil, logicerr.Wrap(apistatus.ObjectNotFound{}) @@ -347,7 +350,7 @@ func getRawObjectBytes(id oid.ID, p string) ([]byte, error) { return data, nil } -func extractCombinedObject(id oid.ID, data []byte) ([]byte, error) { +func extractCombinedObject(id oid.ID, f *os.File) ([]byte, error) { const ( prefixSize = 1 idSize = sha256.Size @@ -358,25 +361,55 @@ func extractCombinedObject(id oid.ID, data []byte) ([]byte, error) { dataOff = lengthOff + lengthSize ) - var notFound bool + var ( + comBuf [dataOff]byte + data []byte + isCombined bool + ) - for len(data) > dataOff && data[0] == combinedPrefix { - notFound = true // The file _is_ combined, so the object _must_ be there. - var l = binary.BigEndian.Uint32(data[lengthOff:dataOff]) - if bytes.Equal(data[idOff:lengthOff], id[:]) { - data = data[dataOff : dataOff+int(l)] - notFound = false - break + for { + n, err := io.ReadFull(f, comBuf[:]) + if err != nil { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + if !isCombined { + return comBuf[:n], nil + } + return nil, fs.ErrNotExist + } + return nil, err + } + if comBuf[0] != combinedPrefix { + st, err := f.Stat() + if err != nil { + return nil, err + } + sz := st.Size() + if sz > math.MaxInt { + return nil, errors.New("too large file") + } + data = make([]byte, int(sz)) + copy(data, comBuf[:]) + _, err = io.ReadFull(f, data[len(comBuf):]) + if err != nil { + return nil, err + } + return data, nil + } + isCombined = true + var l = binary.BigEndian.Uint32(comBuf[lengthOff:dataOff]) + if bytes.Equal(comBuf[idOff:lengthOff], id[:]) { + data = make([]byte, l) + _, err = io.ReadFull(f, data) + if err != nil { + return nil, err + } + return data, nil } - if len(data) < dataOff+int(l) { - break + _, err = f.Seek(int64(l), 1) + if err != nil { + return nil, err } - data = data[dataOff+int(l):] } - if notFound { - return nil, fs.ErrNotExist // Quite similar in meaning. - } - return data, nil } // GetRange implements common.Storage.
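For reference, the combined-file record layout that extractCombinedObject now streams through, reconstructed from the constants in this patch (prefixSize, idSize, lengthSize); parseCombinedHeader is an illustrative helper rather than patch code, and it assumes it sits in the fstree package where combinedPrefix and "encoding/binary" are already available:

    // One record of a combined file:
    //   offset 0:  1 byte   combinedPrefix marker
    //   offset 1:  32 bytes object ID (sha256.Size)
    //   offset 33: 4 bytes  payload length, big-endian uint32
    //   offset 37: payload bytes
    // Records are concatenated back to back; a file that does not start with
    // combinedPrefix holds a single plain object.
    func parseCombinedHeader(hdr [37]byte) (id [32]byte, length uint32, combined bool) {
        if hdr[0] != combinedPrefix { // plain object file, nothing to parse
            return id, 0, false
        }
        copy(id[:], hdr[1:33])
        length = binary.BigEndian.Uint32(hdr[33:37])
        return id, length, true
    }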