Skip to content

Commit

Permalink
adding uint8 experiment
Browse files Browse the repository at this point in the history
  • Loading branch information
dav009 committed Jan 4, 2018
1 parent ec4a12a commit 28d94d1
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,16 @@ Corpus Data Structure Used Memory Accuracy
| Corpus | Data Structure | Used Memory | Accuracy |
|---------|-----------------|-----------------|-----------|
| Half of Wiki corpus (English) | Abacus (1000MB) | 1.75GB | 96% |
| Half of Wiki corpus (English) | Abacus (Log8) (200MB) | 369MB | 70% |
| Half of Wiki corpus (English) | Abacus (Log8) (400MB) | 407MB | 98% |
| Half of Wiki corpus (English) | Map | 3.3GB | 100% |

| Corpus | Data Structure | Used Memory | Accuracy |
|---------|-----------------|-----------------|-----------|
| Complete Wiki corpus (English) | Abacus (2200MB) | 3.63GB | 98% |
| Complete Wiki corpus (English) | Abacus (500MB) | 741MB | 15% |
| Complete Wiki corpus (English) | Abacus (Log8) (500MB) | 760MB | 90% |
| Complete Wiki corpus (English) | Abacus (Log8) (700MB) | 889MB | 97% |
| Complete Wiki corpus (English) | Map | 10.46GB | 100% |

Note: This is me playing with Golang again, heavily based on [Bounter](https://github.com/RaRe-Technologies/bounter)
Expand All @@ -52,5 +59,6 @@ Used to count item frequencies.

Used to calculate the cardinality

-----------

Icon made by [free-icon](https://www.flaticon.com/free-icon/)
45 changes: 45 additions & 0 deletions abacus.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,48 @@ func New(maxMemoryMB uint) memoryAbacus {
a:= memoryAbacus{ MaxMemorySize: maxMemoryMB, s:sketch, h:h, total: big.NewInt(0)}
return a
}


// memoryAbacusLog8 bundles a SketchLog8 frequency table, an HLL value and a
// running total of processed items, as built by NewAbacus8Log.
type memoryAbacusLog8 struct{
// MaxMemorySize is the memory budget (in MB) that was passed to NewAbacus8Log.
MaxMemorySize uint
// s is the sketch incremented by Update and read by Counts.
s *SketchLog8
// h receives a 32-bit murmur3 hash of every item in Update and is read by Cardinality.
h hll.HLL
// total is incremented by one for every item passed to Update.
total *big.Int
}

// Counts returns the sketch's current estimate for key, converted to
// CountType. The returned error is always nil.
func (a *memoryAbacusLog8) Counts(key string) (CountType, error) {
	estimate := a.s.Query([]byte(key))
	return CountType(estimate), nil
}

// Update records one occurrence of every string in items.
//
// For each item it increments the sketch, feeds the item's 32-bit murmur3
// hash to the HLL value and adds one to the running total. The returned
// error is always nil.
func (a *memoryAbacusLog8) Update(items []string) error {
	// Loop-invariant: a single big.Int holding 1 is reused for every
	// addition instead of allocating a new one per item.
	one := big.NewInt(1)

	for _, key := range items {
		// Convert once; neither the sketch nor the hash mutates the slice.
		raw := []byte(key)

		a.s.Incr(raw)
		a.h.Add(uint64(murmur3.Sum32(raw)))
		// Add stores the sum in its receiver, so total is updated in place.
		a.total.Add(a.total, one)
	}

	return nil
}

// Total returns the number of items seen by Update so far.
//
// The returned pointer is the abacus's own counter, not a copy. The
// returned error is always nil.
func (a *memoryAbacusLog8) Total() (*big.Int, error) {
	seen := a.total
	return seen, nil
}

// Cardinality returns the HLL value's cardinality estimate converted to
// CountType. The returned error is always nil.
func (a *memoryAbacusLog8) Cardinality() (CountType, error) {
	estimate := a.h.EstimateCardinality()
	return CountType(estimate), nil
}

// widthAndDepthFromSizeLog8 derives sketch dimensions from a memory budget.
//
// The budget is sizeMB * 1,000,000 bytes. Width is budget / (2 * 8 * cell
// size) and depth is budget / (width * cell size), where the cell size comes
// from sizeOfCellLog8. Both results are truncated to uint32.
//
// A budget small enough to yield a width of zero (sizeMB == 0) panics with an
// integer division by zero when depth is computed.
func widthAndDepthFromSizeLog8(sizeMB uint) (uint32, uint32) {
	cell := uint64(sizeOfCellLog8())

	// Widen to uint64 before multiplying so the byte budget cannot overflow
	// on platforms where uint is 32 bits wide.
	budget := uint64(sizeMB) * 1000000

	width := budget / (2 * 8 * cell)
	depth := budget / (width * cell)
	return uint32(width), uint32(depth)
}

// NewAbacus8Log builds a memoryAbacusLog8 for the given memory budget in MB.
//
// The sketch dimensions come from widthAndDepthFromSizeLog8, the HLL value is
// sized with precision 16, and the running total starts at zero.
func NewAbacus8Log(maxMemoryMB uint) memoryAbacusLog8 {
	width, depth := widthAndDepthFromSizeLog8(maxMemoryMB)
	table := NewSketchLog8(width, depth)

	// The error from SizeByP is ignored.
	// NOTE(review): presumably 16 is always an accepted precision - confirm
	// against the hll package.
	registers, _ := hll.SizeByP(16)

	return memoryAbacusLog8{
		MaxMemorySize: maxMemoryMB,
		s:             table,
		h:             make(hll.HLL, registers),
		total:         big.NewInt(0),
	}
}
89 changes: 89 additions & 0 deletions minsketch8.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package abacus

import "sync"
import (
"github.com/spaolacci/murmur3"
"unsafe"
)

// CountTypeLog8 is the type of a single SketchLog8 cell: an unsigned 8-bit
// value.
type CountTypeLog8 uint8

// MaxLog8 is the largest value a CountTypeLog8 can hold (all bits set).
const MaxLog8 CountTypeLog8 = ^CountTypeLog8(0)

// SketchLog8 is a table of Depth rows by Width columns of CountTypeLog8 cells
// guarded by a read/write mutex.
type SketchLog8 struct {
// Width is the number of cells in each row.
Width uint32
// Depth is the number of rows; positions yields one column index per row.
Depth uint32
// Count holds the cells, indexed as Count[row][column].
Count [][]CountTypeLog8
// mutex guards Count: query takes the read lock, Add takes the write lock.
mutex sync.RWMutex
}

// sizeOfCellLog8 reports the size in bytes of one CountTypeLog8 cell.
func sizeOfCellLog8() uintptr {
	return unsafe.Sizeof(CountTypeLog8(0))
}

// NewSketchLog8 allocates a sketch with depth rows of width zeroed cells
// each and returns a pointer to it.
func NewSketchLog8(width, depth uint32) (sk *SketchLog8) {
	rows := make([][]CountTypeLog8, depth)
	for r := range rows {
		rows[r] = make([]CountTypeLog8, width)
	}

	return &SketchLog8{
		Width: width,
		Depth: depth,
		Count: rows,
	}
}

// Incr adds one occurrence of dat to the sketch and returns whatever Add
// returns for that update.
func (sk *SketchLog8) Incr(dat []byte) (min CountTypeLog8) {
	min = sk.Add(dat, 1)
	return min
}

// positions returns one column index per row for dat.
//
// Two murmur3 hashes are taken: the first with seed 0, the second seeded
// with the first. Row i uses (hash1 + i*hash2) % Width, with the sum
// computed in wrapping uint32 arithmetic.
func (sk *SketchLog8) positions(dat []byte) (pos []uint32) {
	// reference: https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/membership/Filter.java
	base := murmur3.Sum32WithSeed(dat, 0)
	step := murmur3.Sum32WithSeed(dat, base)

	pos = make([]uint32, sk.Depth)

	// offset tracks base + row*step incrementally; uint32 wrap-around makes
	// this equal to the multiplied form.
	offset := base
	for row := range pos {
		pos[row] = offset % sk.Width
		offset += step
	}
	return pos
}

// Add records cnt occurrences of dat and returns the updated estimate.
//
// The estimate is the minimum over the cells selected by positions, plus
// cnt, saturating at MaxLog8 instead of wrapping around. Only cells that are
// below the new estimate are raised to it; cells already at or above it are
// left untouched.
//
// The minimum is read and the cells are written under a single write lock,
// so concurrent Add calls cannot interleave between the read and the write.
func (sk *SketchLog8) Add(dat []byte, cnt CountTypeLog8) (min CountTypeLog8) {
	pos := sk.positions(dat)

	sk.mutex.Lock()
	defer sk.mutex.Unlock()

	// Current estimate: smallest selected cell. The scan is done inline
	// rather than through query, which would try to take the read lock
	// while the write lock is already held.
	min = MaxLog8
	for i := uint32(0); i < sk.Depth; i++ {
		if v := sk.Count[i][pos[i]]; v < min {
			min = v
		}
	}

	// Saturating add: a plain min += cnt would wrap past MaxLog8 and yield
	// a small value, which both skips the update and returns a wrong count.
	if cnt > MaxLog8-min {
		min = MaxLog8
	} else {
		min += cnt
	}

	for i := uint32(0); i < sk.Depth; i++ {
		if sk.Count[i][pos[i]] < min {
			sk.Count[i][pos[i]] = min
		}
	}

	return min
}

// Query returns the sketch's current estimate for dat: the smallest of the
// cells selected by positions.
func (sk *SketchLog8) Query(dat []byte) (min CountTypeLog8) {
	return sk.query(sk.positions(dat))
}

// query returns the smallest cell among Count[row][pos[row]] for every row,
// reading under the read lock. With zero rows it returns MaxLog8.
func (sk *SketchLog8) query(pos []uint32) (min CountTypeLog8) {
	sk.mutex.RLock()

	// Start from the largest representable value and lower it row by row.
	min = MaxLog8
	for row := uint32(0); row < sk.Depth; row++ {
		if cell := sk.Count[row][pos[row]]; cell < min {
			min = cell
		}
	}

	sk.mutex.RUnlock()
	return min
}


0 comments on commit 28d94d1

Please sign in to comment.