From 28d94d14be2631b2f2bc35bd34d69587d1cc622e Mon Sep 17 00:00:00 2001
From: David Przybilla
Date: Thu, 4 Jan 2018 15:51:57 +0900
Subject: [PATCH] adding uint8 experiment

---
 README.md     |   8 ++++
 abacus.go     |  80 +++++++++++++++++++++++++++++++++
 minsketch8.go | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 minsketch8.go

diff --git a/README.md b/README.md
index 909b99b..d6ddb71 100755
--- a/README.md
+++ b/README.md
@@ -32,9 +32,16 @@ Corpus Data Structure Used Memory Accuracy
 | Corpus | Data Structure | Used Memory | Accuracy |
 |---------|-----------------|-----------------|-----------|
 | Half of Wiki corpus (English) | Abacus (1000MB) | 1.75GB | 96% |
+| Half of Wiki corpus (English) | Abacus (Log8) (200MB) | 369MB | 70% |
+| Half of Wiki corpus (English) | Abacus (Log8) (400MB) | 407MB | 98% |
 | Half of Wiki corpus (English) | Map | 3.3GB | 100% |
+
+| Corpus | Data Structure | Used Memory | Accuracy |
+|---------|-----------------|-----------------|-----------|
 | Complete Wiki corpus (English) | Abacus (2200MB) | 3.63GB | 98% |
 | Complete Wiki corpus (English) | Abacus (500MB) | 741MB | 15% |
+| Complete Wiki corpus (English) | Abacus (Log8) (500MB) | 760MB | 90% |
+| Complete Wiki corpus (English) | Abacus (Log8) (700MB) | 889MB | 97% |
 | Complete Wiki corpus (English) | Map | 10.46GB | 100% |
 
 Note: This is me playing with Golang again, heavily based on [Bounter](https://github.com/RaRe-Technologies/bounter)
@@ -52,5 +59,6 @@ Used to count item frequencies.
 
 Used to calculate the cardinality
 
+-----------
 
 Icon made by [free-icon](https://www.flaticon.com/free-icon/)
diff --git a/abacus.go b/abacus.go
index 8bdffdb..b72c99c 100644
--- a/abacus.go
+++ b/abacus.go
@@ -56,3 +56,83 @@ func New(maxMemoryMB uint) memoryAbacus {
 	a:= memoryAbacus{ MaxMemorySize: maxMemoryMB, s:sketch, h:h, total: big.NewInt(0)}
 	return a
 }
+
+// memoryAbacusLog8 is the 8-bit-cell variant of memoryAbacus. It keeps
+// approximate per-key counts in a SketchLog8, an approximate distinct-key
+// count in a HyperLogLog, and the exact number of items seen.
+type memoryAbacusLog8 struct {
+	MaxMemorySize uint
+	s             *SketchLog8
+	h             hll.HLL
+	total         *big.Int
+}
+
+// Counts returns the estimated number of times key has been seen. The estimate
+// is capped at MaxLog8 because each sketch cell is a single byte.
+func (a *memoryAbacusLog8) Counts(key string) (CountType, error) {
+	return CountType(a.s.Query([]byte(key))), nil
+}
+
+// Update records one occurrence of every item in items. It always returns nil.
+func (a *memoryAbacusLog8) Update(items []string) error {
+	one := big.NewInt(1) // hoisted so the loop does not allocate a big.Int per item
+	for _, key := range items {
+		dat := []byte(key)
+		a.s.Incr(dat)
+		a.h.Add(uint64(murmur3.Sum32(dat)))
+		a.total.Add(a.total, one)
+	}
+	return nil
+}
+
+// Total returns the number of items passed to Update so far.
+func (a *memoryAbacusLog8) Total() (*big.Int, error) {
+	return a.total, nil
+}
+
+// Cardinality returns the HyperLogLog estimate of the number of distinct keys.
+func (a *memoryAbacusLog8) Cardinality() (CountType, error) {
+	return CountType(a.h.EstimateCardinality()), nil
+}
+
+// widthAndDepthFromSizeLog8 converts a memory budget in megabytes (10^6 bytes)
+// into sketch dimensions: each row gets 1/16 of the budget and the depth is
+// however many such rows fit. Both results are at least 1, so a zero budget
+// cannot produce a sketch that divides by zero.
+func widthAndDepthFromSizeLog8(sizeMB uint) (uint32, uint32) {
+	const maxDim = uint64(^uint32(0))
+	cell := uint64(sizeOfCellLog8())
+	// Widen before multiplying: uint is 32 bits on some platforms, where
+	// sizeMB*1000000 would wrap for budgets above roughly 4294 MB.
+	budget := uint64(sizeMB) * 1000000
+
+	width := budget / (2 * 8 * cell)
+	if width == 0 {
+		width = 1
+	} else if width > maxDim {
+		width = maxDim
+	}
+	depth := budget / (width * cell)
+	if depth == 0 {
+		depth = 1
+	}
+	return uint32(width), uint32(depth)
+}
+
+// NewAbacus8Log builds a memoryAbacusLog8 whose sketch is sized to roughly
+// maxMemoryMB megabytes. The HyperLogLog is sized with hll.SizeByP(16).
+func NewAbacus8Log(maxMemoryMB uint) memoryAbacusLog8 {
+	w, d := widthAndDepthFromSizeLog8(maxMemoryMB)
+	size, err := hll.SizeByP(16)
+	if err != nil {
+		// 16 is a compile-time constant, so a failure here is a programming
+		// error rather than something a caller could handle.
+		panic(err)
+	}
+	return memoryAbacusLog8{
+		MaxMemorySize: maxMemoryMB,
+		s:             NewSketchLog8(w, d),
+		h:             make(hll.HLL, size),
+		total:         big.NewInt(0),
+	}
+}
diff --git a/minsketch8.go b/minsketch8.go
new file mode 100644
index 0000000..7679f7f
--- /dev/null
+++ b/minsketch8.go
@@ -0,0 +1,121 @@
+package abacus
+
+import (
+	"sync"
+	"unsafe"
+
+	"github.com/spaolacci/murmur3"
+)
+
+// CountTypeLog8 is the type of a single sketch cell: one byte.
+type CountTypeLog8 uint8
+
+// MaxLog8 is the largest value a cell can hold; Add saturates at this value.
+const MaxLog8 = ^(CountTypeLog8(0))
+
+// SketchLog8 is a fixed-size frequency sketch made of Depth rows of Width
+// one-byte counters. Each key maps to one cell per row and its estimated
+// count is the minimum of those cells. Methods guard Count with mutex, so a
+// SketchLog8 must be shared by pointer and never copied.
+type SketchLog8 struct {
+	Width uint32
+	Depth uint32
+	Count [][]CountTypeLog8
+	mutex sync.RWMutex
+}
+
+// sizeOfCellLog8 returns the size in bytes of one sketch cell.
+func sizeOfCellLog8() uintptr {
+	return unsafe.Sizeof(CountTypeLog8(0))
+}
+
+// NewSketchLog8 allocates a zeroed sketch with depth rows of width cells.
+// A width or depth of 0 is raised to 1: a zero width would make every
+// position computation divide by zero, and a zero depth would make every
+// query report MaxLog8.
+func NewSketchLog8(width, depth uint32) *SketchLog8 {
+	if width == 0 {
+		width = 1
+	}
+	if depth == 0 {
+		depth = 1
+	}
+	sk := &SketchLog8{
+		Width: width,
+		Depth: depth,
+		Count: make([][]CountTypeLog8, depth),
+	}
+	for i := range sk.Count {
+		sk.Count[i] = make([]CountTypeLog8, width)
+	}
+	return sk
+}
+
+// Incr adds 1 to the count of dat and returns the new estimate.
+func (sk *SketchLog8) Incr(dat []byte) CountTypeLog8 {
+	return sk.Add(dat, 1)
+}
+
+// positions returns, for each row, the index of the cell dat maps to. Row i
+// uses (hash1 + i*hash2) mod Width, where both hashes are 32-bit murmur3 and
+// the arithmetic wraps on overflow.
+func (sk *SketchLog8) positions(dat []byte) []uint32 {
+	// reference: https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/membership/Filter.java
+	hash1 := murmur3.Sum32WithSeed(dat, 0)
+	hash2 := murmur3.Sum32WithSeed(dat, hash1)
+	pos := make([]uint32, sk.Depth)
+	for i := range pos {
+		pos[i] = (hash1 + uint32(i)*hash2) % sk.Width
+	}
+	return pos
+}
+
+// Add adds cnt to the count of dat and returns the new estimate, saturating
+// at MaxLog8 instead of wrapping around. Only cells currently below the new
+// estimate are raised to it; cells already above it are left untouched.
+func (sk *SketchLog8) Add(dat []byte, cnt CountTypeLog8) CountTypeLog8 {
+	pos := sk.positions(dat)
+
+	// Hold the write lock across both the read and the update. Releasing it
+	// in between would let two concurrent Adds read the same minimum and
+	// lose one of the increments.
+	sk.mutex.Lock()
+	defer sk.mutex.Unlock()
+
+	est := sk.minAt(pos)
+	if cnt > MaxLog8-est {
+		est = MaxLog8
+	} else {
+		est += cnt
+	}
+	for i, p := range pos {
+		if sk.Count[i][p] < est {
+			sk.Count[i][p] = est
+		}
+	}
+	return est
+}
+
+// Query returns the estimated count of dat.
+func (sk *SketchLog8) Query(dat []byte) CountTypeLog8 {
+	return sk.query(sk.positions(dat))
+}
+
+// query returns the minimum cell value at pos while holding the read lock.
+func (sk *SketchLog8) query(pos []uint32) CountTypeLog8 {
+	sk.mutex.RLock()
+	defer sk.mutex.RUnlock()
+	return sk.minAt(pos)
+}
+
+// minAt returns the smallest cell value at pos, which holds one column index
+// per row. The caller must hold sk.mutex.
+func (sk *SketchLog8) minAt(pos []uint32) CountTypeLog8 {
+	est := MaxLog8
+	for i, p := range pos {
+		if v := sk.Count[i][p]; v < est {
+			est = v
+		}
+	}
+	return est
+}