Skip to content
This repository has been archived by the owner on Aug 2, 2021. It is now read-only.

Commit

Permalink
file, testutil: Add reference file hasher (#2099)
Browse files Browse the repository at this point in the history
  • Loading branch information
nolash authored Feb 24, 2020
1 parent 4f23c06 commit 15fbb9d
Show file tree
Hide file tree
Showing 9 changed files with 472 additions and 2 deletions.
66 changes: 66 additions & 0 deletions file/hasher/common_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package hasher

import (
"github.com/ethersphere/swarm/testutil"
)

// Geometry of an unencrypted, non erasure-coded swarm chunk:
// 32-byte hash sections, 128 branches per intermediate chunk.
const (
	sectionSize = 32
	branches    = 128
	chunkSize   = 4096 // sectionSize * branches
)

var (
	// dataLengths enumerates input sizes straddling section, chunk and
	// tree-level boundaries; the trailing index comments pair each entry
	// with the corresponding root hash in expected below.
	dataLengths = []int{31, // 0
		32,                     // 1
		33,                     // 2
		63,                     // 3
		64,                     // 4
		65,                     // 5
		chunkSize,              // 6
		chunkSize + 31,         // 7
		chunkSize + 32,         // 8
		chunkSize + 63,         // 9
		chunkSize + 64,         // 10
		chunkSize * 2,          // 11
		chunkSize*2 + 32,       // 12
		chunkSize * 128,        // 13
		chunkSize*128 + 31,     // 14
		chunkSize*128 + 32,     // 15
		chunkSize*128 + 64,     // 16
		chunkSize * 129,        // 17
		chunkSize * 130,        // 18
		chunkSize * 128 * 128,  // 19
		chunkSize*128*128 + 32, // 20
	}
	// expected holds the hex-encoded reference root hashes, one per entry
	// of dataLengths.
	expected = []string{
		"ece86edb20669cc60d142789d464d57bdf5e33cb789d443f608cbd81cfa5697d", // 0
		"0be77f0bb7abc9cd0abed640ee29849a3072ccfd1020019fe03658c38f087e02", // 1
		"3463b46d4f9d5bfcbf9a23224d635e51896c1daef7d225b86679db17c5fd868e", // 2
		"95510c2ff18276ed94be2160aed4e69c9116573b6f69faaeed1b426fea6a3db8", // 3
		"490072cc55b8ad381335ff882ac51303cc069cbcb8d8d3f7aa152d9c617829fe", // 4
		"541552bae05e9a63a6cb561f69edf36ffe073e441667dbf7a0e9a3864bb744ea", // 5
		"c10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef", // 6
		"91699c83ed93a1f87e326a29ccd8cc775323f9e7260035a5f014c975c5f3cd28", // 7
		"73759673a52c1f1707cbb61337645f4fcbd209cdc53d7e2cedaaa9f44df61285", // 8
		"db1313a727ffc184ae52a70012fbbf7235f551b9f2d2da04bf476abe42a3cb42", // 9
		"ade7af36ac0c7297dc1c11fd7b46981b629c6077bce75300f85b02a6153f161b", // 10
		"29a5fb121ce96194ba8b7b823a1f9c6af87e1791f824940a53b5a7efe3f790d9", // 11
		"61416726988f77b874435bdd89a419edc3861111884fd60e8adf54e2f299efd6", // 12
		"3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09", // 13
		"e5c76afa931e33ac94bce2e754b1bb6407d07f738f67856783d93934ca8fc576", // 14
		"485a526fc74c8a344c43a4545a5987d17af9ab401c0ef1ef63aefcc5c2c086df", // 15
		"624b2abb7aefc0978f891b2a56b665513480e5dc195b4a66cd8def074a6d2e94", // 16
		"b8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199", // 17
		"59de730bf6c67a941f3b2ffa2f920acfaa1713695ad5deea12b4a121e5f23fa1", // 18
		"522194562123473dcfd7a457b18ee7dee8b7db70ed3cfa2b73f348a992fdfd3b", // 19
		"ed0cc44c93b14fef2d91ab3a3674eeb6352a42ac2f0bbe524711824aae1e7bcc", // 20
	}

	// start and end select the sub-range of dataLengths exercised by the tests.
	start = 0
	end   = len(dataLengths)
)

// init prepares the shared swarm test facilities before any test in this
// package runs. NOTE(review): presumably this registers test flags/logging —
// confirm against swarm/testutil.
func init() {
	testutil.Init()
}
2 changes: 1 addition & 1 deletion file/hasher/hasher.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// You should have received a copy of the GNU Lesser General Public License
// along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.

package file
package hasher

import (
"context"
Expand Down
2 changes: 1 addition & 1 deletion file/hasher/hasher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// You should have received a copy of the GNU Lesser General Public License
// along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.

package file
package hasher

import (
"bytes"
Expand Down
56 changes: 56 additions & 0 deletions file/hasher/param.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package hasher

import (
"context"
"sync"

"github.com/ethersphere/swarm/file"
)

// treeParams defines the boundaries of the hashing job and also contains the
// hash factory function of the job.
// Setting Debug means omitting any automatic behavior (for now it means job
// processing won't auto-start).
type treeParams struct {
	SectionSize int   // bytes per hash section/reference
	Branches    int   // number of sections per chunk
	ChunkSize   int   // SectionSize * Branches
	Spans       []int // per-level span table produced by generateSpanSizes; presumably chunks covered by one reference on each level — confirm
	Debug       bool
	hashFunc    file.SectionWriterFunc
	writerPool  sync.Pool       // recycles SectionWriters created by hashFunc
	ctx         context.Context // passed to hashFunc when the pool allocates; set via SetContext
}

// newTreeParams creates a treeParams from the given hash factory function.
// A throwaway hasher is instantiated once to probe the section size and
// branch factor, which fully determine ChunkSize and the per-level span table.
func newTreeParams(hashFunc file.SectionWriterFunc) *treeParams {

	h := hashFunc(context.Background())
	p := &treeParams{
		SectionSize: h.SectionSize(),
		Branches:    h.Branches(),
		ChunkSize:   h.SectionSize() * h.Branches(),
		hashFunc:    hashFunc,
		// Default to a background context so the pool's New below never
		// passes a nil Context to hashFunc when GetWriter is called before
		// SetContext (the context package forbids nil contexts).
		ctx: context.Background(),
	}
	h.Reset()
	p.writerPool.New = func() interface{} {
		return p.hashFunc(p.ctx)
	}
	// 9 levels matches the depth limit documented on ReferenceHasher.
	p.Spans = generateSpanSizes(p.Branches, 9)
	return p
}

// SetContext stores the context that will be handed to hashFunc whenever the
// writer pool has to allocate a new SectionWriter.
func (p *treeParams) SetContext(ctx context.Context) {
	p.ctx = ctx
}

// GetContext returns the context previously stored with SetContext.
func (p *treeParams) GetContext() context.Context {
	return p.ctx
}

// PutWriter resets the writer and returns it to the pool for reuse.
func (p *treeParams) PutWriter(w file.SectionWriter) {
	w.Reset()
	p.writerPool.Put(w)
}

// GetWriter fetches a SectionWriter from the pool, allocating a fresh one via
// hashFunc (see writerPool.New) when the pool is empty.
func (p *treeParams) GetWriter() file.SectionWriter {
	return p.writerPool.Get().(file.SectionWriter)
}
145 changes: 145 additions & 0 deletions file/hasher/reference.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
package hasher

import (
"github.com/ethersphere/swarm/file"
)

// ReferenceHasher is the source-of-truth implementation of the swarm file
// hashing algorithm. It buffers one chunk's worth of pending data per tree
// level and sums bottom-up as chunk boundaries are crossed.
type ReferenceHasher struct {
	params  *treeParams
	cursors []int // section write position, indexed per level
	length  int   // number of bytes written to the data level of the hasher
	buffer  []byte // keeps data and hashes, indexed by cursors; level i's pending data lives between cursors[i+1] and cursors[i]
	counts  []int  // number of sums performed, indexed per level
	hasher  file.SectionWriter // underlying hasher
}

// NewReferenceHasher constructs and returns a new ReferenceHasher.
// This implementation is limited to a tree of 9 levels, where level 0 is the
// data level. With 32 section size and 128 branches (i.e. unencrypted,
// non erasure-coded content) this means a capacity of
// 4096 bytes * (128^(9-1)) ~ 295.148 * (10^18) bytes.
func NewReferenceHasher(params *treeParams) *ReferenceHasher {
	// maxLevels names the depth limit documented above; it replaces the
	// magic number previously repeated across the three allocations below.
	const maxLevels = 9

	// TODO: remove when bmt interface is amended
	h := params.GetWriter()
	return &ReferenceHasher{
		params:  params,
		cursors: make([]int, maxLevels),
		counts:  make([]int, maxLevels),
		buffer:  make([]byte, params.ChunkSize*maxLevels),
		hasher:  h,
	}
}

// Hash computes and returns the root hash of arbitrary data.
// The input is fed to the tree one chunk at a time; any trailing partial
// chunk is handled by hashUnfinished and moveDanglingChunk before the root
// is read out.
func (r *ReferenceHasher) Hash(data []byte) []byte {
	for offset := 0; offset < len(data); offset += r.params.ChunkSize {
		// clamp the slice end so the final (possibly short) chunk is included
		end := offset + r.params.ChunkSize
		if end > len(data) {
			end = len(data)
		}
		r.update(0, data[offset:end])
	}

	// if we didn't end on a chunk boundary we need to hash remaining chunks first
	r.hashUnfinished()

	// if the already hashed parts tree is balanced
	r.moveDanglingChunk()

	return r.digest()
}

// update writes to the data buffer on the specified level.
// It calls sum if a chunk boundary is reached, and recursively calls itself
// for the next level up with the acquired bmt hash, adjusting cursors
// accordingly.
func (r *ReferenceHasher) update(lvl int, data []byte) {
	// only data-level writes count towards the total input length
	if lvl == 0 {
		r.length += len(data)
	}
	copy(r.buffer[r.cursors[lvl]:r.cursors[lvl]+len(data)], data)
	r.cursors[lvl] += len(data)
	// cursors[lvl+1] marks where this level's pending data begins, so the
	// difference is the number of unhashed bytes buffered on this level
	if r.cursors[lvl]-r.cursors[lvl+1] == r.params.ChunkSize {
		ref := r.sum(lvl)
		// propagate the chunk reference one level up
		r.update(lvl+1, ref)
		// rewind: the summed chunk's buffer space is reused for the next chunk
		r.cursors[lvl] = r.cursors[lvl+1]
	}
}

// sum calculates and returns the bmt sum of the last written data on the
// given level, bumping that level's sum counter.
func (r *ReferenceHasher) sum(lvl int) []byte {
	r.counts[lvl]++

	// bytes of underlying data addressed by one full reference on this level
	spanBytes := r.params.Spans[lvl] * r.params.ChunkSize
	// actual data length under this sum (the final span may be partial)
	dataUnderSpan := (r.length-1)%spanBytes + 1

	// this level's pending bytes sit between the next level's cursor and ours
	from := r.cursors[lvl+1]
	to := r.cursors[lvl]

	r.hasher.Reset()
	r.hasher.SetSpan(dataUnderSpan)
	r.hasher.Write(r.buffer[from:to])
	return r.hasher.Sum(nil)
}

// digest reads out the root hash after all summing has completed.
// NOTE(review): the original comment claimed this method sums the final
// chunks of each level, but that work is done by hashUnfinished and
// moveDanglingChunk before Hash calls this; here we only return the result.
func (r *ReferenceHasher) digest() []byte {

	// the first section of the buffer will hold the root hash
	return r.buffer[:r.params.SectionSize]
}

// hashUnfinished hashes the remaining unhashed chunk at the end of the data
// level, if the input did not end on a chunk boundary, and writes the
// resulting reference to level 1.
func (r *ReferenceHasher) hashUnfinished() {
	// nothing pending when the input ended exactly on a chunk boundary
	if r.length%r.params.ChunkSize == 0 {
		return
	}
	ref := r.sum(0)
	copy(r.buffer[r.cursors[1]:], ref)
	r.cursors[1] += len(ref)
	r.cursors[0] = r.cursors[1]
}

// moveDanglingChunk: in case of a balanced tree this method concatenates the
// reference to the single reference at the highest level of the tree.
//
// Let F be full chunks (disregarding branching factor) and S be single references
// in the following scenario:
//
// S
// F F
// F F F
// F F F F S
//
// The result will be:
//
// SS
// F F
// F F F
// F F F F
//
// After which the SS will be hashed to obtain the final root hash
func (r *ReferenceHasher) moveDanglingChunk() {

	// calculate the total number of levels needed to represent the data (including the data level)
	targetLevel := getLevelsFromLength(r.length, r.params.SectionSize, r.params.Branches)

	// sum every intermediate level and write to the level above it
	for i := 1; i < targetLevel; i++ {

		// and if there is a single reference outside a balanced tree on this level
		// don't hash it again but pass it on to the next level
		if r.counts[i] > 0 {
			// TODO: simplify if possible
			// NOTE(review): counts[i-1] is the number of sums performed one
			// level below; comparing it against Spans[targetLevel-1-i] appears
			// to detect a balanced subtree with at most one dangling
			// reference — confirm against the span table semantics.
			if r.counts[i-1]-r.params.Spans[targetLevel-1-i] <= 1 {
				// shift cursors so the dangling reference is carried upward untouched
				r.cursors[i+1] = r.cursors[i]
				r.cursors[i] = r.cursors[i-1]
				continue
			}
		}

		ref := r.sum(i)
		copy(r.buffer[r.cursors[i+1]:], ref)
		r.cursors[i+1] += len(ref)
		r.cursors[i] = r.cursors[i+1]
	}
}
Loading

0 comments on commit 15fbb9d

Please sign in to comment.