From 15fbb9d7f2a0a00fb5e11e38fc3529167cc066a2 Mon Sep 17 00:00:00 2001
From: lash <nolash@users.noreply.github.com>
Date: Mon, 24 Feb 2020 06:41:24 +0100
Subject: [PATCH] file, testutil: Add reference file hasher (#2099)

---
 file/hasher/common_test.go    |  66 ++++++++++++++++
 file/hasher/hasher.go         |   2 +-
 file/hasher/hasher_test.go    |   2 +-
 file/hasher/param.go          |  56 +++++++++++++
 file/hasher/reference.go      | 145 ++++++++++++++++++++++++++++++++++
 file/hasher/reference_test.go | 140 ++++++++++++++++++++++++++++++++
 file/hasher/util.go           |  31 ++++++++
 file/hasher/util_test.go      |  17 ++++
 testutil/data.go              |  15 ++++
 9 files changed, 472 insertions(+), 2 deletions(-)
 create mode 100644 file/hasher/common_test.go
 create mode 100644 file/hasher/param.go
 create mode 100644 file/hasher/reference.go
 create mode 100644 file/hasher/reference_test.go
 create mode 100644 file/hasher/util.go
 create mode 100644 file/hasher/util_test.go
 create mode 100644 testutil/data.go

diff --git a/file/hasher/common_test.go b/file/hasher/common_test.go
new file mode 100644
index 0000000000..bad3556420
--- /dev/null
+++ b/file/hasher/common_test.go
@@ -0,0 +1,66 @@
+package hasher
+
+import (
+	"github.com/ethersphere/swarm/testutil"
+)
+
+const (
+	sectionSize = 32
+	branches    = 128
+	chunkSize   = 4096
+)
+
+var (
+	dataLengths = []int{31, // 0
+		32,                     // 1
+		33,                     // 2
+		63,                     // 3
+		64,                     // 4
+		65,                     // 5
+		chunkSize,              // 6
+		chunkSize + 31,         // 7
+		chunkSize + 32,         // 8
+		chunkSize + 63,         // 9
+		chunkSize + 64,         // 10
+		chunkSize * 2,          // 11
+		chunkSize*2 + 32,       // 12
+		chunkSize * 128,        // 13
+		chunkSize*128 + 31,     // 14
+		chunkSize*128 + 32,     // 15
+		chunkSize*128 + 64,     // 16
+		chunkSize * 129,        // 17
+		chunkSize * 130,        // 18
+		chunkSize * 128 * 128,  // 19
+		chunkSize*128*128 + 32, // 20
+	}
+	expected = []string{
+		"ece86edb20669cc60d142789d464d57bdf5e33cb789d443f608cbd81cfa5697d", // 0
+		"0be77f0bb7abc9cd0abed640ee29849a3072ccfd1020019fe03658c38f087e02", // 1
+		"3463b46d4f9d5bfcbf9a23224d635e51896c1daef7d225b86679db17c5fd868e", // 2
+		"95510c2ff18276ed94be2160aed4e69c9116573b6f69faaeed1b426fea6a3db8", // 3
+		"490072cc55b8ad381335ff882ac51303cc069cbcb8d8d3f7aa152d9c617829fe", // 4
+		"541552bae05e9a63a6cb561f69edf36ffe073e441667dbf7a0e9a3864bb744ea", // 5
+		"c10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef", // 6
+		"91699c83ed93a1f87e326a29ccd8cc775323f9e7260035a5f014c975c5f3cd28", // 7
+		"73759673a52c1f1707cbb61337645f4fcbd209cdc53d7e2cedaaa9f44df61285", // 8
+		"db1313a727ffc184ae52a70012fbbf7235f551b9f2d2da04bf476abe42a3cb42", // 9
+		"ade7af36ac0c7297dc1c11fd7b46981b629c6077bce75300f85b02a6153f161b", // 10
+		"29a5fb121ce96194ba8b7b823a1f9c6af87e1791f824940a53b5a7efe3f790d9", // 11
+		"61416726988f77b874435bdd89a419edc3861111884fd60e8adf54e2f299efd6", // 12
+		"3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09", // 13
+		"e5c76afa931e33ac94bce2e754b1bb6407d07f738f67856783d93934ca8fc576", // 14
+		"485a526fc74c8a344c43a4545a5987d17af9ab401c0ef1ef63aefcc5c2c086df", // 15
+		"624b2abb7aefc0978f891b2a56b665513480e5dc195b4a66cd8def074a6d2e94", // 16
+		"b8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199", // 17
+		"59de730bf6c67a941f3b2ffa2f920acfaa1713695ad5deea12b4a121e5f23fa1", // 18
+		"522194562123473dcfd7a457b18ee7dee8b7db70ed3cfa2b73f348a992fdfd3b", // 19
+		"ed0cc44c93b14fef2d91ab3a3674eeb6352a42ac2f0bbe524711824aae1e7bcc", // 20
+	}
+
+	start = 0
+	end   = len(dataLengths)
+)
+
+func init() {
+	testutil.Init()
+}
diff --git a/file/hasher/hasher.go b/file/hasher/hasher.go
index 9478fb79b8..5cebba192f 100644
--- a/file/hasher/hasher.go
+++ b/file/hasher/hasher.go
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.
 
-package file
+package hasher
 
 import (
 	"context"
diff --git a/file/hasher/hasher_test.go b/file/hasher/hasher_test.go
index babb981ef3..91ca296d81 100644
--- a/file/hasher/hasher_test.go
+++ b/file/hasher/hasher_test.go
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.
 
-package file
+package hasher
 
 import (
 	"bytes"
diff --git a/file/hasher/param.go b/file/hasher/param.go
new file mode 100644
index 0000000000..6de12f1065
--- /dev/null
+++ b/file/hasher/param.go
@@ -0,0 +1,56 @@
+package hasher
+
+import (
+	"context"
+	"sync"
+
+	"github.com/ethersphere/swarm/file"
+)
+
+// treeParams defines the boundaries of the hashing job and also contains the hash factory function of the job
+// setting Debug means omitting any automatic behavior (for now it means job processing won't auto-start)
+type treeParams struct {
+	SectionSize int                    // section size reported by a writer created with hashFunc
+	Branches    int                    // branching factor reported by a writer created with hashFunc
+	ChunkSize   int                    // SectionSize * Branches
+	Spans       []int                  // per-level span sizes, filled by generateSpanSizes
+	Debug       bool                   // disables automatic behavior, see the type comment
+	hashFunc    file.SectionWriterFunc // factory for the underlying section writers
+	writerPool  sync.Pool              // reusable writers, created through hashFunc on demand
+	ctx         context.Context        // handed to hashFunc by the pool; set with SetContext
+}
+
+// newTreeParams derives the tree geometry from one writer created by hashFunc and sets up a pool of such writers
+func newTreeParams(hashFunc file.SectionWriterFunc) *treeParams {
+	h := hashFunc(context.Background())
+	p := &treeParams{
+		SectionSize: h.SectionSize(),
+		Branches:    h.Branches(),
+		ChunkSize:   h.SectionSize() * h.Branches(),
+		hashFunc:    hashFunc,
+		ctx:         context.Background(), // default, so the pool never hands a nil context to hashFunc
+	}
+	h.Reset()
+	p.writerPool.New = func() interface{} {
+		return p.hashFunc(p.ctx)
+	}
+	p.Spans = generateSpanSizes(p.Branches, 9)
+	return p
+}
+
+func (p *treeParams) SetContext(ctx context.Context) {
+	p.ctx = ctx
+}
+
+func (p *treeParams) GetContext() context.Context {
+	return p.ctx
+}
+
+func (p *treeParams) PutWriter(w file.SectionWriter) {
+	w.Reset()
+	p.writerPool.Put(w)
+}
+
+func (p *treeParams) GetWriter() file.SectionWriter {
+	return p.writerPool.Get().(file.SectionWriter)
+}
diff --git a/file/hasher/reference.go b/file/hasher/reference.go
new file mode 100644
index 0000000000..0ceb570ee8
--- /dev/null
+++ b/file/hasher/reference.go
@@ -0,0 +1,145 @@
+package hasher
+
+import (
+	"github.com/ethersphere/swarm/file"
+)
+
+// ReferenceHasher is the source-of-truth implementation of the swarm file hashing algorithm
+type ReferenceHasher struct {
+	params  *treeParams
+	cursors []int              // section write position, indexed per level
+	length  int                // number of bytes written to the data level of the hasher
+	buffer  []byte             // keeps data and hashes, indexed by cursors
+	counts  []int              // number of sums performed, indexed per level
+	hasher  file.SectionWriter // underlying hasher
+}
+
+// NewReferenceHasher constructs and returns a new ReferenceHasher
+// This implementation is limited to a tree of 9 levels, where level 0 is the data level
+// With 32 section size and 128 branches (i.e. unencrypted, non erasure-coded content) this means
+// a capacity of 4096 bytes * (128^(9-1)) ~ 295.148 * (10^18) bytes
+func NewReferenceHasher(params *treeParams) *ReferenceHasher {
+	const levels = 9 // maximum tree depth, data level included
+	// cursors and counts hold one entry per level; the buffer reserves one chunk per level
+	return &ReferenceHasher{
+		params:  params,
+		cursors: make([]int, levels),
+		counts:  make([]int, levels),
+		buffer:  make([]byte, params.ChunkSize*levels),
+		hasher:  params.GetWriter(), // TODO: remove when bmt interface is amended
+	}
+}
+
+// Hash computes and returns the root hash of arbitrary data
+func (r *ReferenceHasher) Hash(data []byte) []byte {
+	chunkSize := r.params.ChunkSize
+	for offset := 0; offset < len(data); offset += chunkSize {
+		end := offset + chunkSize
+		if end > len(data) {
+			end = len(data)
+		}
+		r.update(0, data[offset:end])
+	}
+
+	// a trailing partial chunk has not been summed by update, so hash it first
+	r.hashUnfinished()
+
+	// then sum the intermediate levels, passing a dangling single reference upwards unhashed
+	r.moveDanglingChunk()
+	return r.digest()
+}
+
+// update writes data to the buffer at the write cursor of the specified level (writes to level 0 also count towards r.length)
+// when the bytes between cursors[lvl+1] and cursors[lvl] amount to exactly one chunk it calls sum and recursively calls
+// this function for the next level with the acquired bmt hash, then rewinds the level's cursor to that of the next level
+func (r *ReferenceHasher) update(lvl int, data []byte) {
+	if lvl == 0 {
+		r.length += len(data)
+	}
+	copy(r.buffer[r.cursors[lvl]:r.cursors[lvl]+len(data)], data)
+	r.cursors[lvl] += len(data)
+	if r.cursors[lvl]-r.cursors[lvl+1] == r.params.ChunkSize {
+		ref := r.sum(lvl)
+		r.update(lvl+1, ref)
+		r.cursors[lvl] = r.cursors[lvl+1]
+	}
+}
+
+// sum calculates and returns the bmt sum of the data last written on the level,
+// i.e. the bytes in the buffer between cursors[lvl+1] and cursors[lvl]
+func (r *ReferenceHasher) sum(lvl int) []byte {
+	r.counts[lvl]++
+	// for a non-zero length, (length-1)%capacity+1 maps an exact multiple of the capacity to the capacity itself, not 0
+	capacity := r.params.Spans[lvl] * r.params.ChunkSize
+	span := (r.length-1)%capacity + 1
+
+	from, to := r.cursors[lvl+1], r.cursors[lvl]
+	r.hasher.Reset()
+	r.hasher.SetSpan(span)
+	r.hasher.Write(r.buffer[from:to])
+	return r.hasher.Sum(nil)
+}
+
+// digest returns the root hash; it performs no hashing itself
+// Hash calls it last, after hashUnfinished and moveDanglingChunk have run
+// the returned slice is a view into the hasher's internal buffer, not a copy
+func (r *ReferenceHasher) digest() []byte {
+
+	// the first section of the buffer will hold the root hash
+	return r.buffer[:r.params.SectionSize]
+}
+
+// hashUnfinished sums the trailing partial chunk of the data level, if there is one, and appends the reference to level 1
+func (r *ReferenceHasher) hashUnfinished() {
+	if r.length%r.params.ChunkSize != 0 {
+		ref := r.sum(0)
+		copy(r.buffer[r.cursors[1]:], ref)
+		r.cursors[1] += len(ref)
+		r.cursors[0] = r.cursors[1] // the data level is consumed; realign its cursor with level 1
+	}
+}
+
+// moveDanglingChunk sums the intermediate levels up to the root level; in case of a balanced tree it concatenates
+// the dangling reference to the single reference at the highest level of the tree instead of hashing it again.
+//
+// Let F be full chunks (disregarding branching factor) and S be single references
+// in the following scenario:
+//
+//       S
+//     F   F
+//   F   F   F
+// F   F   F   F S
+//
+// The result will be:
+//
+//       SS
+//     F    F
+//   F   F   F
+// F   F   F   F
+//
+// After which the SS will be hashed to obtain the final root hash
+func (r *ReferenceHasher) moveDanglingChunk() {
+
+	// index of the level that will hold the root hash for this data length, where level 0 is the data level
+	targetLevel := getLevelsFromLength(r.length, r.params.SectionSize, r.params.Branches)
+
+	// sum every intermediate level and write to the level above it
+	for i := 1; i < targetLevel; i++ {
+
+		// and if there is a single reference outside a balanced tree on this level
+		// don't hash it again but pass it on to the next level
+		if r.counts[i] > 0 {
+			// TODO: simplify if possible
+			if r.counts[i-1]-r.params.Spans[targetLevel-1-i] <= 1 {
+				r.cursors[i+1] = r.cursors[i]
+				r.cursors[i] = r.cursors[i-1]
+				continue
+			}
+		}
+
+		ref := r.sum(i)
+		copy(r.buffer[r.cursors[i+1]:], ref)
+		r.cursors[i+1] += len(ref)
+		r.cursors[i] = r.cursors[i+1]
+	}
+}
diff --git a/file/hasher/reference_test.go b/file/hasher/reference_test.go
new file mode 100644
index 0000000000..d4deef5c0b
--- /dev/null
+++ b/file/hasher/reference_test.go
@@ -0,0 +1,140 @@
+package hasher
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/ethereum/go-ethereum/common/hexutil"
+	"github.com/ethersphere/swarm/bmt"
+	"github.com/ethersphere/swarm/file"
+	"github.com/ethersphere/swarm/log"
+	"github.com/ethersphere/swarm/testutil"
+	"golang.org/x/crypto/sha3"
+)
+
+// TestManualDanglingChunk is a test script explicitly hashing and writing every individual level in the dangling chunk edge case
+// we use a balanced tree with data size of chunkSize*branches, and a single chunk of data
+// this case is chosen because it produces the wrong result in the pyramid hasher at the time of writing (master commit hash 4928d989ebd0854d993c10c194e61a5a5455e4f9)
+func TestManualDanglingChunk(t *testing.T) {
+	pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+	h := bmt.New(pool)
+
+	// to execute the job we need buffers with the following capacities:
+	// level 0: chunkSize*branches+chunkSize
+	// level 1: chunkSize
+	// level 2: sectionSize * 2
+	var levels [][]byte
+	levels = append(levels, nil)
+	levels = append(levels, make([]byte, chunkSize))
+	levels = append(levels, make([]byte, sectionSize*2))
+
+	// hash the balanced tree portion of the data level and write to level 1
+	_, levels[0] = testutil.SerialData(chunkSize*branches+chunkSize, 255, 0)
+	for i := 0; i < chunkSize*branches; i += chunkSize {
+		h.Reset()
+		h.SetSpan(chunkSize)
+		h.Write(levels[0][i : i+chunkSize])
+		copy(levels[1][i/branches:], h.Sum(nil)) // i/branches == (i/chunkSize)*sectionSize, as chunkSize == sectionSize*branches
+	}
+	refHex := hexutil.Encode(levels[1][:sectionSize])
+	correctRefHex := "0xc10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
+	}
+
+	// write the dangling chunk
+	// hash it and write the reference on the second section of level 2
+	h.Reset()
+	h.SetSpan(chunkSize)
+	h.Write(levels[0][chunkSize*branches:])
+	copy(levels[2][sectionSize:], h.Sum(nil))
+	refHex = hexutil.Encode(levels[2][sectionSize:])
+	correctRefHex = "0x81b31d9a7f6c377523e8769db021091df23edd9fd7bd6bcdf11a22f518db6006"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
+	}
+
+	// hash the chunk on level 1 and write into the first section of level 2
+	h.Reset()
+	h.SetSpan(chunkSize * branches)
+	h.Write(levels[1])
+	copy(levels[2], h.Sum(nil))
+	refHex = hexutil.Encode(levels[2][:sectionSize])
+	correctRefHex = "0x3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling balanced tree; expected %s, got %s", correctRefHex, refHex)
+	}
+
+	// hash the two sections on level 2 to obtain the root hash
+	h.Reset()
+	h.SetSpan(chunkSize*branches + chunkSize)
+	h.Write(levels[2])
+	ref := h.Sum(nil)
+	refHex = hexutil.Encode(ref)
+	correctRefHex = "0xb8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199"
+	if refHex != correctRefHex {
+		t.Fatalf("manual dangling root; expected %s, got %s", correctRefHex, refHex)
+	}
+}
+
+// TestReferenceHasherVector executes the file hasher algorithms on serial input data of periods of 0-254
+// of lengths defined in common_test.go
+//
+// the "expected" array in common_test.go is generated by this implementation, and test failure due to
+// result mismatch is nothing else than an indication that something has changed in the reference filehasher
+// or the underlying hashing algorithm
+func TestReferenceHasherVector(t *testing.T) {
+
+	hashFunc := func(_ context.Context) file.SectionWriter {
+		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+		return bmt.New(pool)
+	}
+	params := newTreeParams(hashFunc)
+	var mismatch int
+	for i := start; i < end; i++ {
+		dataLength := dataLengths[i]
+		log.Info("start", "i", i, "len", dataLength)
+		rh := NewReferenceHasher(params)
+		_, data := testutil.SerialData(dataLength, 255, 0)
+		refHash := rh.Hash(data)
+		eq := true
+		if expected[i] != fmt.Sprintf("%x", refHash) {
+			mismatch++
+			eq = false
+		}
+		t.Logf("[%7d+%4d]\t%v\tref: %x\texpect: %s", dataLength/chunkSize, dataLength%chunkSize, eq, refHash, expected[i])
+	}
+	if mismatch > 0 {
+		t.Fatalf("mismatches: %d/%d", mismatch, end-start)
+	}
+}
+
+// BenchmarkReferenceHasher establishes a baseline for a fully synchronous file hashing operation
+// it will be vastly inefficient
+func BenchmarkReferenceHasher(b *testing.B) {
+	for i := start; i < end; i++ {
+		b.Run(fmt.Sprintf("%d", dataLengths[i]), benchmarkReferenceHasher)
+	}
+}
+
+// benchmarkReferenceHasher hashes serial data whose length is parsed from the sub-benchmark name set by the caller
+func benchmarkReferenceHasher(b *testing.B) {
+	benchParams := strings.Split(b.Name(), "/")
+	dataLength, err := strconv.Atoi(benchParams[1])
+	if err != nil {
+		b.Fatal(err)
+	}
+	hashFunc := func(_ context.Context) file.SectionWriter {
+		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+		return bmt.New(pool)
+	}
+	params := newTreeParams(hashFunc)
+	_, data := testutil.SerialData(dataLength, 255, 0) // generated once, outside the timed region
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		NewReferenceHasher(params).Hash(data)
+	}
+}
diff --git a/file/hasher/util.go b/file/hasher/util.go
new file mode 100644
index 0000000000..141fd1d114
--- /dev/null
+++ b/file/hasher/util.go
@@ -0,0 +1,31 @@
+package hasher
+
+import (
+	"math"
+)
+
+// TODO: level 0 should be SectionSize() not Branches()
+// generateSpanSizes generates a dictionary of maximum span lengths per level; entry i holds branches^i, so entry 0 is 1
+func generateSpanSizes(branches int, levels int) []int {
+	spans := make([]int, levels)
+	span := 1
+	for i := range spans { // bounded by levels; a fixed bound of 9 overran shorter slices and left longer ones unfilled
+		spans[i] = span
+		span *= branches
+	}
+	return spans
+}
+
+// TODO: use params instead of sectionSize, branches
+// getLevelsFromLength calculates the last level index which data of l bytes will result in.
+// the returned level will be the level of the root hash
+func getLevelsFromLength(l int, sectionSize int, branches int) int {
+	switch {
+	case l == 0:
+		return 0
+	case l <= sectionSize*branches:
+		return 1
+	}
+	sections := float64((l - 1) / sectionSize)
+	return int(math.Log(sections)/math.Log(float64(branches)) + 1)
+}
diff --git a/file/hasher/util_test.go b/file/hasher/util_test.go
new file mode 100644
index 0000000000..51640e4ad5
--- /dev/null
+++ b/file/hasher/util_test.go
@@ -0,0 +1,17 @@
+package hasher
+
+import "testing"
+
+// TestLevelsFromLength verifies getLevelsFromLength
+func TestLevelsFromLength(t *testing.T) {
+
+	sizes := []int{sectionSize, chunkSize, chunkSize + sectionSize, chunkSize * branches, chunkSize*branches + 1}
+	expects := []int{1, 1, 2, 2, 3}
+
+	for i, size := range sizes {
+		lvl := getLevelsFromLength(size, sectionSize, branches)
+		if expects[i] != lvl {
+			t.Fatalf("size %d, expected %d, got %d", size, expects[i], lvl)
+		}
+	}
+}
diff --git a/testutil/data.go b/testutil/data.go
new file mode 100644
index 0000000000..f3bea59e91
--- /dev/null
+++ b/testutil/data.go
@@ -0,0 +1,15 @@
+package testutil
+
+import (
+	"bytes"
+	"io"
+)
+
+// SerialData returns a slice of l bytes where byte i is (i+offset)%mod, along with a reader over those same bytes
+func SerialData(l int, mod int, offset int) (r io.Reader, slice []byte) {
+	slice = make([]byte, l)
+	for pos := range slice {
+		slice[pos] = byte((pos + offset) % mod)
+	}
+	return io.LimitReader(bytes.NewReader(slice), int64(l)), slice
+}