From 15fbb9d7f2a0a00fb5e11e38fc3529167cc066a2 Mon Sep 17 00:00:00 2001
From: lash
Date: Mon, 24 Feb 2020 06:41:24 +0100
Subject: [PATCH] file, testutil: Add reference file hasher (#2099)

---
 file/hasher/common_test.go    |  66 ++++++++++++++++
 file/hasher/hasher.go         |   2 +-
 file/hasher/hasher_test.go    |   2 +-
 file/hasher/param.go          |  56 +++++++++++++
 file/hasher/reference.go      | 145 ++++++++++++++++++++++++++++++++++
 file/hasher/reference_test.go | 140 ++++++++++++++++++++++++++++++++
 file/hasher/util.go           |  31 ++++++++
 file/hasher/util_test.go      |  17 ++++
 testutil/data.go              |  15 ++++
 9 files changed, 472 insertions(+), 2 deletions(-)
 create mode 100644 file/hasher/common_test.go
 create mode 100644 file/hasher/param.go
 create mode 100644 file/hasher/reference.go
 create mode 100644 file/hasher/reference_test.go
 create mode 100644 file/hasher/util.go
 create mode 100644 file/hasher/util_test.go
 create mode 100644 testutil/data.go

diff --git a/file/hasher/common_test.go b/file/hasher/common_test.go
new file mode 100644
index 0000000000..bad3556420
--- /dev/null
+++ b/file/hasher/common_test.go
@@ -0,0 +1,66 @@
+package hasher
+
+import (
+	"github.com/ethersphere/swarm/testutil"
+)
+
+const (
+	sectionSize = 32   // bytes per section
+	branches    = 128  // sections per chunk
+	chunkSize   = 4096 // sectionSize * branches
+)
+
+var (
+	dataLengths = []int{31, // 0
+		32,                     // 1
+		33,                     // 2
+		63,                     // 3
+		64,                     // 4
+		65,                     // 5
+		chunkSize,              // 6
+		chunkSize + 31,         // 7
+		chunkSize + 32,         // 8
+		chunkSize + 63,         // 9
+		chunkSize + 64,         // 10
+		chunkSize * 2,          // 11
+		chunkSize*2 + 32,       // 12
+		chunkSize * 128,        // 13
+		chunkSize*128 + 31,     // 14
+		chunkSize*128 + 32,     // 15
+		chunkSize*128 + 64,     // 16
+		chunkSize * 129,        // 17
+		chunkSize * 130,        // 18
+		chunkSize * 128 * 128,  // 19
+		chunkSize*128*128 + 32, // 20
+	}
+	expected = []string{
+		"ece86edb20669cc60d142789d464d57bdf5e33cb789d443f608cbd81cfa5697d", // 0
+		"0be77f0bb7abc9cd0abed640ee29849a3072ccfd1020019fe03658c38f087e02", // 1
+		"3463b46d4f9d5bfcbf9a23224d635e51896c1daef7d225b86679db17c5fd868e", // 2
+		"95510c2ff18276ed94be2160aed4e69c9116573b6f69faaeed1b426fea6a3db8", // 3
+		"490072cc55b8ad381335ff882ac51303cc069cbcb8d8d3f7aa152d9c617829fe", // 4
+		"541552bae05e9a63a6cb561f69edf36ffe073e441667dbf7a0e9a3864bb744ea", // 5
+		"c10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef", // 6
+		"91699c83ed93a1f87e326a29ccd8cc775323f9e7260035a5f014c975c5f3cd28", // 7
+		"73759673a52c1f1707cbb61337645f4fcbd209cdc53d7e2cedaaa9f44df61285", // 8
+		"db1313a727ffc184ae52a70012fbbf7235f551b9f2d2da04bf476abe42a3cb42", // 9
+		"ade7af36ac0c7297dc1c11fd7b46981b629c6077bce75300f85b02a6153f161b", // 10
+		"29a5fb121ce96194ba8b7b823a1f9c6af87e1791f824940a53b5a7efe3f790d9", // 11
+		"61416726988f77b874435bdd89a419edc3861111884fd60e8adf54e2f299efd6", // 12
+		"3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09", // 13
+		"e5c76afa931e33ac94bce2e754b1bb6407d07f738f67856783d93934ca8fc576", // 14
+		"485a526fc74c8a344c43a4545a5987d17af9ab401c0ef1ef63aefcc5c2c086df", // 15
+		"624b2abb7aefc0978f891b2a56b665513480e5dc195b4a66cd8def074a6d2e94", // 16
+		"b8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199", // 17
+		"59de730bf6c67a941f3b2ffa2f920acfaa1713695ad5deea12b4a121e5f23fa1", // 18
+		"522194562123473dcfd7a457b18ee7dee8b7db70ed3cfa2b73f348a992fdfd3b", // 19
+		"ed0cc44c93b14fef2d91ab3a3674eeb6352a42ac2f0bbe524711824aae1e7bcc", // 20
+	}
+
+	start = 0                // first dataLengths index the tests iterate from
+	end   = len(dataLengths) // exclusive upper bound of that iteration
+)
+
+func init() {
+	testutil.Init()
+}
diff --git a/file/hasher/hasher.go b/file/hasher/hasher.go
index 9478fb79b8..5cebba192f 100644
--- a/file/hasher/hasher.go
+++ b/file/hasher/hasher.go
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.
 
-package file
+package hasher
 
 import (
 	"context"
diff --git a/file/hasher/hasher_test.go b/file/hasher/hasher_test.go
index babb981ef3..91ca296d81 100644
--- a/file/hasher/hasher_test.go
+++ b/file/hasher/hasher_test.go
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.
 
-package file
+package hasher
 
 import (
 	"bytes"
diff --git a/file/hasher/param.go b/file/hasher/param.go
new file mode 100644
index 0000000000..6de12f1065
--- /dev/null
+++ b/file/hasher/param.go
@@ -0,0 +1,56 @@
+package hasher
+
+import (
+	"context"
+	"sync"
+
+	"github.com/ethersphere/swarm/file"
+)
+
+// defines the boundaries of the hashing job and also contains the hash factory function of the job
+// setting Debug means omitting any automatic behavior (for now it means job processing won't auto-start)
+type treeParams struct {
+	SectionSize int
+	Branches    int
+	ChunkSize   int
+	Spans       []int
+	Debug       bool
+	hashFunc    file.SectionWriterFunc
+	writerPool  sync.Pool
+	ctx         context.Context
+}
+
+// newTreeParams derives section size, branches and chunk size from a writer created by hashFunc
+func newTreeParams(hashFunc file.SectionWriterFunc) *treeParams {
+	h := hashFunc(context.Background())
+	p := &treeParams{
+		SectionSize: h.SectionSize(),
+		Branches:    h.Branches(),
+		ChunkSize:   h.SectionSize() * h.Branches(),
+		hashFunc:    hashFunc,
+		ctx:         context.Background(), // pool writers must never be created with a nil context
+	}
+	h.Reset()
+	p.writerPool.New = func() interface{} {
+		return p.hashFunc(p.ctx)
+	}
+	p.Spans = generateSpanSizes(p.Branches, 9)
+	return p
+}
+
+func (p *treeParams) SetContext(ctx context.Context) {
+	p.ctx = ctx
+}
+
+func (p *treeParams) GetContext() context.Context {
+	return p.ctx
+}
+
+func (p *treeParams) PutWriter(w file.SectionWriter) {
+	w.Reset()
+	p.writerPool.Put(w)
+}
+
+func (p *treeParams) GetWriter() file.SectionWriter {
+	return p.writerPool.Get().(file.SectionWriter)
+}
diff --git a/file/hasher/reference.go b/file/hasher/reference.go
new file mode 100644
index 0000000000..0ceb570ee8
--- /dev/null
+++ b/file/hasher/reference.go
@@ -0,0 +1,145 @@
+package hasher
+
+import (
+	"github.com/ethersphere/swarm/file"
+)
+
+// ReferenceHasher is the source-of-truth implementation of the swarm file hashing algorithm
+type ReferenceHasher struct {
+	params  *treeParams
+	cursors []int              // section write position, indexed per level
+	length  int                // number of bytes written to the data level of the hasher
+	buffer  []byte             // keeps data and hashes, indexed by cursors
+	counts  []int              // number of sums performed, indexed per level
+	hasher  file.SectionWriter // underlying hasher
+}
+
+// NewReferenceHasher constructs and returns a new ReferenceHasher
+// This implementation is limited to a tree of 9 levels, where level 0 is the data level
+// With 32 section size and 128 branches (i.e. unencrypted, non erasure-coded content) this means
+// a capacity of 4096 bytes * (128^(9-1)) ~ 295.148 * (10^18) bytes
+func NewReferenceHasher(params *treeParams) *ReferenceHasher {
+	// TODO: remove when bmt interface is amended
+	h := params.GetWriter()
+	return &ReferenceHasher{
+		params:  params,
+		cursors: make([]int, 9),
+		counts:  make([]int, 9),
+		buffer:  make([]byte, params.ChunkSize*9),
+		hasher:  h,
+	}
+}
+
+// Hash computes and returns the root hash of arbitrary data
+func (r *ReferenceHasher) Hash(data []byte) []byte {
+	l := r.params.ChunkSize
+	for i := 0; i < len(data); i += r.params.ChunkSize {
+		if len(data)-i < r.params.ChunkSize {
+			l = len(data) - i
+		}
+		r.update(0, data[i:i+l])
+	}
+
+	// if we didn't end on a chunk boundary we need to hash remaining chunks first
+	r.hashUnfinished()
+
+	// if the already hashed parts tree is balanced
+	r.moveDanglingChunk()
+
+	return r.digest()
+}
+
+// write to the data buffer on the specified level
+// calls sum if chunk boundary is reached and recursively calls this function for the next level with the acquired bmt hash
+// adjusts cursors accordingly
+func (r *ReferenceHasher) update(lvl int, data []byte) {
+	if lvl == 0 {
+		r.length += len(data)
+	}
+	copy(r.buffer[r.cursors[lvl]:r.cursors[lvl]+len(data)], data)
+	r.cursors[lvl] += len(data)
+	if r.cursors[lvl]-r.cursors[lvl+1] == r.params.ChunkSize {
+		ref := r.sum(lvl)
+		r.update(lvl+1, ref)
+		r.cursors[lvl] = r.cursors[lvl+1]
+	}
+}
+
+// calculates and returns the bmt sum of the last written data on the level
+func (r *ReferenceHasher) sum(lvl int) []byte {
+	r.counts[lvl]++
+	spanSize := r.params.Spans[lvl] * r.params.ChunkSize
+	span := (r.length-1)%spanSize + 1
+
+	sizeToSum := r.cursors[lvl] - r.cursors[lvl+1]
+
+	r.hasher.Reset()
+	r.hasher.SetSpan(span)
+	r.hasher.Write(r.buffer[r.cursors[lvl+1] : r.cursors[lvl+1]+sizeToSum])
+	ref := r.hasher.Sum(nil)
+	return ref
+}
+
+// digest returns the first section of the buffer, which Hash treats as the root hash
+// it performs no hashing itself; Hash calls hashUnfinished and moveDanglingChunk before it
+// the returned slice aliases the internal buffer of the hasher
+func (r *ReferenceHasher) digest() []byte {
+
+	// the first section of the buffer will hold the root hash
+	return r.buffer[:r.params.SectionSize]
+}
+
+// hashes the remaining unhashed chunks at the end of each level
+func (r *ReferenceHasher) hashUnfinished() {
+	if r.length%r.params.ChunkSize != 0 {
+		ref := r.sum(0)
+		copy(r.buffer[r.cursors[1]:], ref)
+		r.cursors[1] += len(ref)
+		r.cursors[0] = r.cursors[1]
+	}
+}
+
+// in case of a balanced tree this method concatenates the reference to the single reference
+// at the highest level of the tree.
+//
+// Let F be full chunks (disregarding branching factor) and S be single references
+// in the following scenario:
+//
+//       S
+//     F   F
+//   F   F   F
+// F   F   F   F S
+//
+// The result will be:
+//
+//       SS
+//     F   F
+//   F   F   F
+// F   F   F   F
+//
+// After which the SS will be hashed to obtain the final root hash
+func (r *ReferenceHasher) moveDanglingChunk() {
+
+	// calculate the total number of levels needed to represent the data (including the data level)
+	targetLevel := getLevelsFromLength(r.length, r.params.SectionSize, r.params.Branches)
+
+	// sum every intermediate level and write to the level above it
+	for i := 1; i < targetLevel; i++ {
+
+		// and if there is a single reference outside a balanced tree on this level
+		// don't hash it again but pass it on to the next level
+		if r.counts[i] > 0 {
+			// TODO: simplify if possible
+			if r.counts[i-1]-r.params.Spans[targetLevel-1-i] <= 1 {
+				r.cursors[i+1] = r.cursors[i]
+				r.cursors[i] = r.cursors[i-1]
+				continue
+			}
+		}
+
+		ref := r.sum(i)
+		copy(r.buffer[r.cursors[i+1]:], ref)
+		r.cursors[i+1] += len(ref)
+		r.cursors[i] = r.cursors[i+1]
+	}
+}
diff --git a/file/hasher/reference_test.go b/file/hasher/reference_test.go
new file mode 100644
index 0000000000..d4deef5c0b
--- /dev/null
+++ b/file/hasher/reference_test.go
@@ -0,0 +1,140 @@
+package hasher
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/ethereum/go-ethereum/common/hexutil"
+	"github.com/ethersphere/swarm/bmt"
+	"github.com/ethersphere/swarm/file"
+	"github.com/ethersphere/swarm/log"
+	"github.com/ethersphere/swarm/testutil"
+	"golang.org/x/crypto/sha3"
+)
+
+// TestManualDanglingChunk is a test script explicitly hashing and writing every individual level in the dangling chunk edge case
+// we use a balanced tree with data size of chunkSize*branches, and a single chunk of data
+// this case is chosen because it produces the wrong result in the pyramid hasher at the time of writing (master commit hash 4928d989ebd0854d993c10c194e61a5a5455e4f9)
+func TestManualDanglingChunk(t *testing.T) {
+	pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+	h := bmt.New(pool)
+
+	// to execute the job we need buffers with the following capacities:
+	// level 0: chunkSize*branches+chunkSize
+	// level 1: chunkSize
+	// level 2: sectionSize * 2
+	var levels [][]byte
+	levels = append(levels, nil)
+	levels = append(levels, make([]byte, chunkSize))
+	levels = append(levels, make([]byte, sectionSize*2))
+
+	// hash the balanced tree portion of the data level and write to level 1
+	_, levels[0] = testutil.SerialData(chunkSize*branches+chunkSize, 255, 0)
+	for i := 0; i < chunkSize*branches; i += chunkSize {
+		h.Reset()
+		h.SetSpan(chunkSize)
+		h.Write(levels[0][i : i+chunkSize])
+		copy(levels[1][i/branches:], h.Sum(nil))
+	}
+	refHex := hexutil.Encode(levels[1][:sectionSize])
+	correctRefHex := "0xc10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
+	}
+
+	// write the dangling chunk
+	// hash it and write the reference on the second section of level 2
+	h.Reset()
+	h.SetSpan(chunkSize)
+	h.Write(levels[0][chunkSize*branches:])
+	copy(levels[2][sectionSize:], h.Sum(nil))
+	refHex = hexutil.Encode(levels[2][sectionSize:])
+	correctRefHex = "0x81b31d9a7f6c377523e8769db021091df23edd9fd7bd6bcdf11a22f518db6006"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
+	}
+
+	// hash the chunk on level 1 and write into the first section of level 2
+	h.Reset()
+	h.SetSpan(chunkSize * branches)
+	h.Write(levels[1])
+	copy(levels[2], h.Sum(nil))
+	refHex = hexutil.Encode(levels[2][:sectionSize])
+	correctRefHex = "0x3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling balanced tree; expected %s, got %s", correctRefHex, refHex)
+	}
+
+	// hash the two sections on level 2 to obtain the root hash
+	h.Reset()
+	h.SetSpan(chunkSize*branches + chunkSize)
+	h.Write(levels[2])
+	ref := h.Sum(nil)
+	refHex = hexutil.Encode(ref)
+	correctRefHex = "0xb8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling root; expected %s, got %s", correctRefHex, refHex)
+	}
+}
+
+// TestReferenceHasherVector executes the file hasher algorithms on serial input data of periods of 0-254
+// of lengths defined in common_test.go
+//
+// the "expected" array in common_test.go is generated by this implementation, and test failure due to
+// result mismatch is nothing else than an indication that something has changed in the reference filehasher
+// or the underlying hashing algorithm
+func TestReferenceHasherVector(t *testing.T) {
+
+	hashFunc := func(_ context.Context) file.SectionWriter {
+		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+		return bmt.New(pool)
+	}
+	params := newTreeParams(hashFunc)
+	var mismatch int
+	for i := start; i < end; i++ {
+		dataLength := dataLengths[i]
+		log.Info("start", "i", i, "len", dataLength)
+		rh := NewReferenceHasher(params)
+		_, data := testutil.SerialData(dataLength, 255, 0)
+		refHash := rh.Hash(data)
+		eq := true
+		if expected[i] != fmt.Sprintf("%x", refHash) {
+			mismatch++
+			eq = false
+		}
+		t.Logf("[%7d+%4d]\t%v\tref: %x\texpect: %s", dataLength/chunkSize, dataLength%chunkSize, eq, refHash, expected[i])
+	}
+	if mismatch > 0 {
+		t.Fatalf("mismatches: %d/%d", mismatch, end-start)
+	}
+}
+
+// BenchmarkReferenceHasher establishes a baseline for a fully synchronous file hashing operation
+// it will be vastly inefficient
+func BenchmarkReferenceHasher(b *testing.B) {
+	for i := start; i < end; i++ {
+		b.Run(fmt.Sprintf("%d", dataLengths[i]), benchmarkReferenceHasher)
+	}
+}
+
+func benchmarkReferenceHasher(b *testing.B) {
+	benchParams := strings.Split(b.Name(), "/")
+	dataLength, err := strconv.ParseInt(benchParams[1], 10, 64)
+	if err != nil {
+		b.Fatal(err)
+	}
+	hashFunc := func(_ context.Context) file.SectionWriter {
+		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+		return bmt.New(pool)
+	}
+	params := newTreeParams(hashFunc)
+	_, data := testutil.SerialData(int(dataLength), 255, 0) // generated once, outside the timed region
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		fh := NewReferenceHasher(params)
+		fh.Hash(data)
+	}
+}
diff --git a/file/hasher/util.go b/file/hasher/util.go
new file mode 100644
index 0000000000..141fd1d114
--- /dev/null
+++ b/file/hasher/util.go
@@ -0,0 +1,31 @@
+package hasher
+
+// TODO: level 0 should be SectionSize() not Branches()
+// generateSpanSizes generates a dictionary of maximum span lengths per level represented by one SectionSize() of data
+// entry i of the returned slice holds branches^i, for every i below levels
+func generateSpanSizes(branches int, levels int) []int {
+	spans := make([]int, levels)
+	span := 1
+	for i := 0; i < levels; i++ {
+		spans[i] = span
+		span *= branches
+	}
+	return spans
+}
+
+// TODO: use params instead of sectionSize, branches
+// calculate the last level index which a particular data section count will result in.
+// the returned level will be the level of the root hash
+// the logarithm is taken by integer division, so exact powers of branches cannot be misrounded
+func getLevelsFromLength(l int, sectionSize int, branches int) int {
+	if l == 0 {
+		return 0
+	} else if l <= sectionSize*branches || branches < 2 {
+		return 1
+	}
+	lvl := 1
+	for c := (l - 1) / sectionSize; c >= branches; c /= branches {
+		lvl++
+	}
+	return lvl
+}
diff --git a/file/hasher/util_test.go b/file/hasher/util_test.go
new file mode 100644
index 0000000000..51640e4ad5
--- /dev/null
+++ b/file/hasher/util_test.go
@@ -0,0 +1,17 @@
+package hasher
+
+import "testing"
+
+// TestLevelsFromLength verifies getLevelsFromLength
+func TestLevelsFromLength(t *testing.T) {
+
+	sizes := []int{sectionSize, chunkSize, chunkSize + sectionSize, chunkSize * branches, chunkSize*branches + 1}
+	expects := []int{1, 1, 2, 2, 3}
+
+	for i, size := range sizes {
+		lvl := getLevelsFromLength(size, sectionSize, branches)
+		if expects[i] != lvl {
+			t.Fatalf("size %d, expected %d, got %d", size, expects[i], lvl)
+		}
+	}
+}
diff --git a/testutil/data.go b/testutil/data.go
new file mode 100644
index 0000000000..f3bea59e91
--- /dev/null
+++ b/testutil/data.go
@@ -0,0 +1,15 @@
+package testutil
+
+import (
+	"bytes"
+	"io"
+)
+
+// SerialData returns l bytes where byte i is (i+offset)%mod, both as a reader and as a slice; mod must not be 0
+func SerialData(l int, mod int, offset int) (r io.Reader, slice []byte) {
+	slice = make([]byte, l)
+	for i := 0; i < len(slice); i++ {
+		slice[i] = byte((i + offset) % mod)
+	}
+	return io.LimitReader(bytes.NewReader(slice), int64(l)), slice
+}