diff --git a/internal/byte_input.go b/internal/byte_input.go index 3e5490a9..ed305f46 100644 --- a/internal/byte_input.go +++ b/internal/byte_input.go @@ -42,6 +42,25 @@ type ByteBuffer struct { off int } +// NewByteBuffer creates a new ByteBuffer. +func NewByteBuffer(buf []byte) *ByteBuffer { + return &ByteBuffer{ + buf: buf, + } +} + +var _ io.Reader = (*ByteBuffer)(nil) + +// Read implements io.Reader. +func (b *ByteBuffer) Read(p []byte) (int, error) { + data, err := b.Next(len(p)) + if err != nil { + return 0, err + } + copy(p, data) + return len(data), nil +} + // Next returns a slice containing the next n bytes from the reader // If there are fewer bytes than the given n, io.ErrUnexpectedEOF will be returned func (b *ByteBuffer) Next(n int) ([]byte, error) { @@ -109,26 +128,39 @@ func (b *ByteBuffer) Reset(buf []byte) { type ByteInputAdapter struct { r io.Reader readBytes int + buf [4]byte +} + +var _ io.Reader = (*ByteInputAdapter)(nil) + +// Read implements io.Reader. +func (b *ByteInputAdapter) Read(buf []byte) (int, error) { + m, err := io.ReadAtLeast(b.r, buf, len(buf)) + b.readBytes += m + + if err != nil { + return 0, err + } + + return m, nil } // Next returns a slice containing the next n bytes from the buffer, // advancing the buffer as if the bytes had been returned by Read. func (b *ByteInputAdapter) Next(n int) ([]byte, error) { buf := make([]byte, n) - m, err := io.ReadAtLeast(b.r, buf, n) - b.readBytes += m + _, err := b.Read(buf) if err != nil { return nil, err } - return buf, nil } // ReadUInt32 reads uint32 with LittleEndian order func (b *ByteInputAdapter) ReadUInt32() (uint32, error) { - buf, err := b.Next(4) - + buf := b.buf[:4] + _, err := b.Read(buf) if err != nil { return 0, err } @@ -138,8 +170,8 @@ func (b *ByteInputAdapter) ReadUInt32() (uint32, error) { // ReadUInt16 reads uint16 with LittleEndian order func (b *ByteInputAdapter) ReadUInt16() (uint16, error) { - buf, err := b.Next(2) - + buf := b.buf[:2] + _, err := b.Read(buf) if err != nil { return 0, err } diff --git a/roaring.go b/roaring.go index 7220da27..ea3ec3fc 100644 --- a/roaring.go +++ b/roaring.go @@ -63,7 +63,7 @@ func (rb *Bitmap) ToBytes() ([]byte, error) { func (rb *Bitmap) Checksum() uint64 { const ( offset = 14695981039346656037 - prime = 1099511628211 + prime = 1099511628211 ) var bytes []byte @@ -106,6 +106,14 @@ func (rb *Bitmap) Checksum() uint64 { return hash } +// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy. +// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap. +// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually. +func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) { + stream := internal.NewByteBuffer(data) + return rb.ReadFrom(stream) +} + // ReadFrom reads a serialized version of this bitmap from stream. // The format is compatible with other RoaringBitmap // implementations (Java, C) and is documented here: @@ -114,12 +122,18 @@ func (rb *Bitmap) Checksum() uint64 { // So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom. // It is not necessary to pass cookieHeader when call roaring.ReadFrom to read the roaring32 data directly. func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) { - stream := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter) - stream.Reset(reader) + stream, ok := reader.(internal.ByteInput) + if !ok { + byteInputAdapter := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter) + byteInputAdapter.Reset(reader) + stream = byteInputAdapter + } p, err = rb.highlowcontainer.readFrom(stream, cookieHeader...) - internal.ByteInputAdapterPool.Put(stream) + if !ok { + internal.ByteInputAdapterPool.Put(stream.(*internal.ByteInputAdapter)) + } return } @@ -144,7 +158,6 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err // bitmap derived from this bitmap (e.g., via Or, And) might // also be broken. Thus, before making buf unavailable, you should // call CloneCopyOnWriteContainers on all such bitmaps. -// func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) { stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer) stream.Reset(buf) @@ -276,9 +289,9 @@ type intIterator struct { // This way, instead of making up-to 64k allocations per full iteration // we get a single allocation and simply reinitialize the appropriate // iterator and point to it in the generic `iter` member on each key bound. - shortIter shortIterator - runIter runIterator16 - bitmapIter bitmapContainerShortIterator + shortIter shortIterator + runIter runIterator16 + bitmapIter bitmapContainerShortIterator } // HasNext returns true if there are more integers to iterate over @@ -341,7 +354,6 @@ func (ii *intIterator) AdvanceIfNeeded(minval uint32) { // IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) type IntIterator = intIterator - // Initialize configures the existing iterator so that it can iterate through the values of // the provided bitmap. // The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove). @@ -357,9 +369,9 @@ type intReverseIterator struct { iter shortIterable highlowcontainer *roaringArray - shortIter reverseIterator - runIter runReverseIterator16 - bitmapIter reverseBitmapContainerShortIterator + shortIter reverseIterator + runIter runReverseIterator16 + bitmapIter reverseBitmapContainerShortIterator } // HasNext returns true if there are more integers to iterate over @@ -434,9 +446,9 @@ type manyIntIterator struct { iter manyIterable highlowcontainer *roaringArray - shortIter shortIterator - runIter runIterator16 - bitmapIter bitmapContainerManyIterator + shortIter shortIterator + runIter runIterator16 + bitmapIter bitmapContainerManyIterator } func (ii *manyIntIterator) init() { @@ -495,7 +507,6 @@ func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int { return n } - // ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) type ManyIntIterator = manyIntIterator @@ -569,7 +580,7 @@ func (rb *Bitmap) Iterate(cb func(x uint32) bool) { // Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order; // the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). func (rb *Bitmap) Iterator() IntPeekable { - p := new(intIterator) + p := new(intIterator) p.Initialize(rb) return p } diff --git a/roaring64/roaring64.go b/roaring64/roaring64.go index 688f84d8..e498b280 100644 --- a/roaring64/roaring64.go +++ b/roaring64/roaring64.go @@ -9,6 +9,7 @@ import ( "strconv" "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/internal" ) const serialCookieNoRunContainer = 12346 // only arrays and bitmaps @@ -61,7 +62,7 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { } n += int64(written) pos := 0 - keyBuf := make([]byte, 4) + keyBuf := buf[:4] for pos < rb.highlowcontainer.size() { c := rb.highlowcontainer.getContainerAtIndex(pos) binary.LittleEndian.PutUint32(keyBuf, rb.highlowcontainer.getKeyAtIndex(pos)) @@ -80,6 +81,86 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { return n, nil } +// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy. +// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap. +// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually. +func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) { + stream := internal.NewByteBuffer(data) + + cookie, r32, p, err := tryReadFromRoaring32ByteBuffer(rb, stream) + if err != nil { + return p, err + } else if r32 { + return p, nil + } + + var sizeBuf [8]byte // Avoid changing the original byte slice. + sizeBuf2, err := stream.Next(4) + if err != nil { + return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read number of containers: %w", err) + } + p += 4 + copy(sizeBuf[:], cookie) + copy(sizeBuf[4:], sizeBuf2) + size := binary.LittleEndian.Uint64(sizeBuf[:]) + + rb.highlowcontainer.resize(0) + if cap(rb.highlowcontainer.keys) >= int(size) { + rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size] + } else { + rb.highlowcontainer.keys = make([]uint32, size) + } + if cap(rb.highlowcontainer.containers) >= int(size) { + rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size] + } else { + rb.highlowcontainer.containers = make([]*roaring.Bitmap, size) + } + if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) { + rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size] + } else { + rb.highlowcontainer.needCopyOnWrite = make([]bool, size) + } + for i := uint64(0); i < size; i++ { + keyBuf, err := stream.Next(4) + if err != nil { + return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read key #%d: %w", i, err) + } + p += 4 + rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf) + rb.highlowcontainer.containers[i] = roaring.NewBitmap() + n, err := rb.highlowcontainer.containers[i].ReadFrom(stream) + if n == 0 || err != nil { + return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) + } + p += int64(n) + } + + return p, nil +} + +func tryReadFromRoaring32ByteBuffer(rb *Bitmap, stream *internal.ByteBuffer) (cookie []byte, r32 bool, p int64, err error) { + // Verify the first two bytes are a valid MagicNumber. + cookie, err = stream.Next(4) + if err != nil { + return cookie, false, 0, err + } + fileMagic := int(binary.LittleEndian.Uint16(cookie[0:2])) + if fileMagic == serialCookieNoRunContainer || fileMagic == serialCookie { + bm32 := roaring.NewBitmap() + p, err = bm32.ReadFrom(stream, cookie...) + if err != nil { + return + } + // Try reuse the underlying slices. + rb.highlowcontainer.resize(0) + rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0) + rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32) + rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false) + return cookie, true, p, nil + } + return +} + // ReadFrom reads a serialized version of this bitmap from stream. // The format is compatible with other 64-bit RoaringBitmap // implementations (Java, Go, C++) and it has a specification : @@ -103,11 +184,23 @@ func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) { sizeBuf = append(cookie, sizeBuf...) size := binary.LittleEndian.Uint64(sizeBuf) - rb.highlowcontainer = roaringArray64{} - rb.highlowcontainer.keys = make([]uint32, size) - rb.highlowcontainer.containers = make([]*roaring.Bitmap, size) - rb.highlowcontainer.needCopyOnWrite = make([]bool, size) - keyBuf := make([]byte, 4) + rb.highlowcontainer.resize(0) + if cap(rb.highlowcontainer.keys) >= int(size) { + rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size] + } else { + rb.highlowcontainer.keys = make([]uint32, size) + } + if cap(rb.highlowcontainer.containers) >= int(size) { + rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size] + } else { + rb.highlowcontainer.containers = make([]*roaring.Bitmap, size) + } + if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) { + rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size] + } else { + rb.highlowcontainer.needCopyOnWrite = make([]bool, size) + } + keyBuf := sizeBuf[:4] for i := uint64(0); i < size; i++ { n, err = stream.Read(keyBuf) if n == 0 || err != nil { @@ -140,11 +233,11 @@ func tryReadFromRoaring32(rb *Bitmap, stream io.Reader) (cookie []byte, r32 bool if err != nil { return } - rb.highlowcontainer = roaringArray64{ - keys: []uint32{0}, - containers: []*roaring.Bitmap{bm32}, - needCopyOnWrite: []bool{false}, - } + // Try reuse the underlying slices. + rb.highlowcontainer.resize(0) + rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0) + rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32) + rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false) return cookie, true, p, nil } return diff --git a/roaring64/roaringarray64.go b/roaring64/roaringarray64.go index 446c1884..26aabd72 100644 --- a/roaring64/roaringarray64.go +++ b/roaring64/roaringarray64.go @@ -124,6 +124,8 @@ func (ra *roaringArray64) removeIndexRange(begin, end int) { func (ra *roaringArray64) resize(newsize int) { for k := newsize; k < len(ra.containers); k++ { + ra.keys[k] = 0 + ra.needCopyOnWrite[k] = false ra.containers[k] = nil } diff --git a/roaring64/serialization_test.go b/roaring64/serialization_test.go index 06d21526..98964226 100644 --- a/roaring64/serialization_test.go +++ b/roaring64/serialization_test.go @@ -5,6 +5,7 @@ package roaring64 import ( "bytes" "fmt" + "io" "os" "runtime" "testing" @@ -21,12 +22,18 @@ func TestSerializationOfEmptyBitmap(t *testing.T) { require.NoError(t, err) assert.EqualValues(t, buf.Len(), rb.GetSerializedSizeInBytes()) + data := buf.Bytes() newrb := NewBitmap() _, err = newrb.ReadFrom(buf) require.NoError(t, err) assert.True(t, rb.Equals(newrb)) + + newrb2 := NewBitmap() + _, err = newrb2.FromUnsafeBytes(data) + require.NoError(t, err) + assert.True(t, rb.Equals(newrb2)) } func TestBase64_036(t *testing.T) { @@ -51,26 +58,32 @@ func TestSerializationBasic037(t *testing.T) { require.NoError(t, err) assert.EqualValues(t, buf.Len(), rb.GetSerializedSizeInBytes()) + data := buf.Bytes() newrb := NewBitmap() _, err = newrb.ReadFrom(buf) require.NoError(t, err) assert.True(t, rb.Equals(newrb)) + + newrb2 := NewBitmap() + _, err = newrb2.FromUnsafeBytes(data) + require.NoError(t, err) + assert.True(t, rb.Equals(newrb2)) } func TestSerializationToFile038(t *testing.T) { rb := BitmapOf(1, 2, 3, 4, 5, 100, 1000) fname := "myfile.bin" fout, err := os.OpenFile(fname, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0660) - if(err != nil) { + if err != nil { fmt.Fprintf(os.Stderr, "\n\nIMPORTANT: For testing file IO, the roaring library requires disk access.\nWe omit some tests for now.\n\n") return } var l int64 l, err = rb.WriteTo(fout) - if(err != nil) { + if err != nil { fmt.Fprintf(os.Stderr, "\n\nIMPORTANT: For testing file IO, the roaring library requires disk access.\nWe omit some tests for now.\n\n") return } @@ -81,18 +94,25 @@ func TestSerializationToFile038(t *testing.T) { newrb := NewBitmap() fin, err := os.Open(fname) - if(err != nil) { + if err != nil { fmt.Fprintf(os.Stderr, "\n\nIMPORTANT: For testing file IO, the roaring library requires disk access.\nWe omit some tests for now.\n\n") return } + buf := bytes.NewBuffer(nil) + teer := io.TeeReader(fin, buf) defer func() { fin.Close() _ = os.Remove(fname) }() - _, _ = newrb.ReadFrom(fin) + _, _ = newrb.ReadFrom(teer) assert.True(t, rb.Equals(newrb)) + + newrb2 := NewBitmap() + _, err = newrb2.FromUnsafeBytes(buf.Bytes()) + require.NoError(t, err) + assert.True(t, rb.Equals(newrb2)) } func TestSerializationBasic2_041(t *testing.T) { @@ -104,12 +124,18 @@ func TestSerializationBasic2_041(t *testing.T) { require.NoError(t, err) assert.Equal(t, l, buf.Len()) + data := buf.Bytes() newrb := NewBitmap() _, err = newrb.ReadFrom(buf) require.NoError(t, err) assert.True(t, rb.Equals(newrb)) + + newrb2 := NewBitmap() + _, err = newrb2.FromUnsafeBytes(data) + require.NoError(t, err) + assert.True(t, rb.Equals(newrb2)) } // roaringarray.writeTo and .readFrom should serialize and unserialize when containing all 3 container types @@ -124,12 +150,18 @@ func TestSerializationBasic3_042(t *testing.T) { require.NoError(t, err) assert.EqualValues(t, buf.Len(), int(rb.GetSerializedSizeInBytes())) + data := buf.Bytes() newrb := NewBitmap() _, err = newrb.ReadFrom(&buf) require.NoError(t, err) assert.True(t, newrb.Equals(rb)) + + newrb2 := NewBitmap() + _, err = newrb2.FromUnsafeBytes(data) + require.NoError(t, err) + assert.True(t, rb.Equals(newrb2)) } func TestHoldReference(t *testing.T) { @@ -172,7 +204,23 @@ func TestHoldReference(t *testing.T) { }) } +func BenchmarkUnserializeFromUnsafeBytes(b *testing.B) { + benchmarkUnserializeFunc(b, "FromUnsafeBytes", func(bitmap *Bitmap, data []byte) (int64, error) { + copied := make([]byte, len(data)) + copy(copied, data) + return bitmap.FromUnsafeBytes(copied) + }) +} + func BenchmarkUnserializeReadFrom(b *testing.B) { + benchmarkUnserializeFunc(b, "ReadFrom", func(bitmap *Bitmap, data []byte) (int64, error) { + return bitmap.ReadFrom(bytes.NewReader(data)) + }) +} + +func benchmarkUnserializeFunc(b *testing.B, name string, f func(*Bitmap, []byte) (int64, error)) { + b.Helper() + for _, size := range []uint64{650, 6500, 65000, 650000, 6500000} { rb := New() buf := &bytes.Buffer{} @@ -187,15 +235,14 @@ func BenchmarkUnserializeReadFrom(b *testing.B) { b.Fatalf("Unexpected error occurs: %v", err) } - b.Run(fmt.Sprintf("ReadFrom-%d", size), func(b *testing.B) { + b.Run(fmt.Sprintf("%s-%d", name, size), func(b *testing.B) { b.ReportAllocs() b.StartTimer() for n := 0; n < b.N; n++ { - reader := bytes.NewReader(buf.Bytes()) nb := New() - if _, err := nb.ReadFrom(reader); err != nil { + if _, err := f(nb, buf.Bytes()); err != nil { b.Fatalf("Unexpected error occurs: %v", err) } }