bloom filter bulk inserts and queries #723

Open · wants to merge 6 commits into main

19 changes: 2 additions & 17 deletions bench/macro/lsm-tree-bench-bloomfilter.hs
@@ -25,14 +25,10 @@ import Text.Printf (printf)

import Database.LSMTree.Extras.Orphans ()
import Database.LSMTree.Internal.Assertions (fromIntegralChecked)
-import qualified Database.LSMTree.Internal.BloomFilterQuery1 as Bloom1
+import qualified Database.LSMTree.Internal.BloomFilter as Bloom
import Database.LSMTree.Internal.Serialise (SerialisedKey,
                 serialiseKey)

-#ifdef BLOOM_QUERY_FAST
-import qualified Database.LSMTree.Internal.BloomFilterQuery2 as Bloom2
-#endif
-
main :: IO ()
main = do
    hSetBuffering stdout NoBuffering
@@ -112,22 +108,11 @@ benchmarks = do
    benchmark "bloomQueries1"
      "(this is the batch lookup, less the cost of computing and hashing the keys)"
      (benchInBatches benchmarkBatchSize rng0
-       (\ks -> Bloom1.bloomQueries vbs ks `seq` ()))
+       (\ks -> Bloom.bloomQueries vbs ks `seq` ()))
      (fromIntegralChecked benchmarkNumLookups)
      hashcost
      0

-#ifdef BLOOM_QUERY_FAST
-  _ <-
-    benchmark "bloomQueries2"
-      "(this is the optimised batch lookup, less the cost of computing and hashing the keys)"
-      (benchInBatches benchmarkBatchSize rng0
-        (\ks -> Bloom2.bloomQueries vbs ks `seq` ()))
-      (fromIntegralChecked benchmarkNumLookups)
-      hashcost
-      0
-#endif
-
    return ()

type Alloc = Int
87 changes: 87 additions & 0 deletions bloomfilter/src/Data/BloomFilter/Blocked.hs
@@ -55,6 +55,7 @@ module Data.BloomFilter.Blocked (
    new,
    maxSizeBits,
    insert,
+   insertMany,

    -- ** Conversion
    freeze,
@@ -74,7 +75,9 @@ module Data.BloomFilter.Blocked (
import Control.Monad.Primitive (PrimMonad, PrimState, RealWorld,
                stToPrim)
import Control.Monad.ST (ST, runST)
+import Data.Bits ((.&.))
import Data.Primitive.ByteArray (MutableByteArray)
+import qualified Data.Primitive.PrimArray as P

import Data.BloomFilter.Blocked.Calc
import Data.BloomFilter.Blocked.Internal hiding (deserialise)
@@ -190,3 +193,87 @@ deserialise bloomsize fill = do
    Internal.deserialise mbloom fill
    stToPrim $ unsafeFreeze mbloom


-----------------------------------------------------------
-- Bulk insert
--

{-# INLINE insertMany #-}
-- | A bulk insert of many elements.
--
-- This is somewhat faster than repeated insertion using 'insert'. It uses
-- memory prefetching to improve the utilisation of memory bandwidth. This has
-- the greatest benefit for large filters (that do not fit in the L3 cache) and
-- for inserting many elements, e.g. more than 10.
--
-- To get the best performance, you probably want to specialise this function
-- to the 'Hashable' instance and the lookup action. It is marked @INLINE@ to
-- help with this, but for the same reason you should avoid calling it from
-- many different call sites: define the specialisations you need and use those.
--
insertMany ::
     forall a s.
     Hashable a
  => MBloom s a
  -> (Int -> ST s a) -- ^ Action to look up elements, indexed @0..n-1@
  -> Int             -- ^ @n@, the number of elements to insert
  -> ST s ()
insertMany bloom key n =
    P.newPrimArray 0x10 >>= body
  where
    -- The general strategy is to use a rolling buffer @buf@ (of size 16). At
    -- the write end of the buffer, we prepare the probe locations and prefetch
    -- the corresponding cache line. At the read end, we do the hash insert.
    -- By having a prefetch distance of 15 between the write and read ends, we
    -- can have up to 15 memory reads in flight at once, thus improving
    -- utilisation of the memory bandwidth.
    body :: P.MutablePrimArray s (Hashes a) -> ST s ()
    body !buf = prepareProbes 0 0
      where
        -- Start by filling the buffer as far as we can: up to the prefetch
        -- distance of 15 entries (one less than the buffer size, so that a
        -- full buffer is distinguishable from an empty one), or until we run
        -- out of elements.
        prepareProbes :: Int -> Int -> ST s ()
        prepareProbes !i !i_w
          | i_w < 0x0f && i < n = do
              k <- key i
              let !kh = hashes k
              prefetchInsert bloom kh
              P.writePrimArray buf i_w kh
              prepareProbes (i+1) (i_w+1)

          | n > 0     = insertProbe i 0 i_w
          | otherwise = return ()

        -- Read from the read end of the buffer and do the inserts.
        insertProbe :: Int -> Int -> Int -> ST s ()
        insertProbe !i !i_r !i_w = do
            kh <- P.readPrimArray buf i_r
            insertHashes bloom kh
            nextProbe i i_r i_w

        -- Move on to the next entry.
        nextProbe :: Int -> Int -> Int -> ST s ()
        nextProbe !i !i_r !i_w
            -- If there are elements left, we prepare the next one at the
            -- write end of the buffer, before inserting the next element
            -- (from the read end of the buffer).
          | i < n = do
              k <- key i
              let !kh = hashes k
              prefetchInsert bloom kh
              P.writePrimArray buf i_w kh
              insertProbe
                (i+1)
                ((i_r + 1) .&. 0x0f)
                ((i_w + 1) .&. 0x0f)

            -- Or if there are no more elements to add to the buffer, but the
            -- buffer is still non-empty, we just loop, draining the buffer.
          | ((i_r + 1) .&. 0x0f) /= i_w =
              insertProbe
                i
                ((i_r + 1) .&. 0x0f)
                i_w

            -- When the buffer is empty, we're done.
          | otherwise = return ()
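
As a usage note (an editorial sketch, not part of this diff): since 'insertMany' is marked @INLINE@, the intended pattern is one specialised wrapper per key type and lookup action. A minimal sketch, assuming a Data.Vector dependency and a Hashable instance for Word64 (which the test suite below also relies on); the wrapper name is hypothetical:

import Control.Monad.ST (ST)
import Data.BloomFilter.Blocked (MBloom, insertMany)
import qualified Data.Vector as V
import Data.Word (Word64)

-- Hypothetical specialisation: bulk-insert every element of a boxed vector,
-- using O(1) vector indexing as the lookup action.
insertManyWord64 :: MBloom s Word64 -> V.Vector Word64 -> ST s ()
insertManyWord64 bloom keys =
    insertMany bloom (\i -> pure (V.unsafeIndex keys i)) (V.length keys)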
16 changes: 12 additions & 4 deletions bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs
@@ -34,7 +34,7 @@ import Data.Primitive.PrimArray
import Data.Word (Word64, Word8)

import GHC.Exts (Int (I#), prefetchByteArray0#,
-                prefetchMutableByteArray3#)
+                prefetchMutableByteArray0#)
import GHC.ST (ST (ST))

-- | An array of blocks of bits.
@@ -83,6 +83,9 @@ prefetchIndex (BitArray (PrimArray ba#)) (BlockIx blockIx) =

    assert (i >= 0 && i < sizeofByteArray (ByteArray ba#) - 63) $

+   -- In prefetchByteArray0, the 0 refers to a "non-temporal" load, which is
+   -- a hint that the value will be used soon, and then not used again
+   -- (soon). So the caches can evict the value as soon as they like.
    ST (\s -> case prefetchByteArray0# ba# i# s of
      s' -> (# s', () #))

@@ -133,8 +136,10 @@ unsafeSet (MBitArray arr) blockIx blockBitIx = do
{-# INLINE prefetchSet #-}
prefetchSet :: MBitArray s -> BlockIx -> ST s ()
prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do
-   -- For setting, we will do several writes to the same cache line, so
-   -- read it into all 3 levels of cache.
+   -- For setting, we will do several writes to the same cache line, but all
+   -- immediately after each other, after which we will not need the value in
+   -- the cache again (for a long time). So as with prefetchIndex we want to
+   -- disturb the caches as little as possible, using prefetchMutableByteArray0.
    let !(I# i#) = fromIntegral blockIx `shiftL` 6
    -- blockIx * 64 to go from block index to the byte offset of the beginning
    -- of the block. This offset is in bytes, not words.

@@ -144,7 +149,10 @@ prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do
    assert (let i = I# i# in i >= 0 && i < sz-63) $ return ()
#endif

-   ST (\s -> case prefetchMutableByteArray3# mba# i# s of
+   -- In prefetchMutableByteArray0, the 0 refers to a "non-temporal" load,
+   -- which is a hint that the value will be used soon, and then not used
+   -- again (soon). So the caches can evict the value as soon as they like.
+   ST (\s -> case prefetchMutableByteArray0# mba# i# s of
      s' -> (# s', () #))

freeze :: MBitArray s -> ST s BitArray
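
As background (editorial note, not part of this diff): the numeric suffix on GHC's prefetch primops selects a locality level, from 3 (high temporal locality: pull the line into all cache levels) down to 0 (non-temporal: use once, evict freely). A minimal sketch of the two read-side variants, assuming only GHC.Exts and the primitive package:

{-# LANGUAGE MagicHash, UnboxedTuples #-}
import Data.Primitive.ByteArray (ByteArray (..))
import GHC.Exts (Int (I#), prefetchByteArray0#, prefetchByteArray3#)
import GHC.ST (ST (ST))

-- Non-temporal prefetch of the cache line at byte offset i: hints that the
-- line will be read once soon and may then be evicted. This is the level
-- the PR switches to.
prefetchOnce :: ByteArray -> Int -> ST s ()
prefetchOnce (ByteArray ba#) (I# i#) =
    ST (\s -> case prefetchByteArray0# ba# i# s of s' -> (# s', () #))

-- Temporal prefetch: hints that the line will be reused, so it is pulled
-- into all cache levels. This is the level the PR switches away from.
prefetchKeep :: ByteArray -> Int -> ST s ()
prefetchKeep (ByteArray ba#) (I# i#) =
    ST (\s -> case prefetchByteArray3# ba# i# s of s' -> (# s', () #))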
20 changes: 20 additions & 0 deletions bloomfilter/tests/bloomfilter-tests.hs
@@ -44,6 +44,7 @@ tests =
      test_calculations proxyBlocked
        (FPR 1e-4, FPR 1e-1) (BitsPerEntry 3, BitsPerEntry 24) 1e-2
    , test_fromList proxyBlocked
    , testProperty "prop_insertMany" prop_insertMany
    ]
  , tests_hashes
  ]
@@ -240,6 +241,25 @@ prop_rechunked f s =
prop_rechunked_eq :: LBS.ByteString -> Property
prop_rechunked_eq = prop_rechunked hash64

-------------------------------------------------------------------------------
-- Bulk operations
-------------------------------------------------------------------------------

-- Currently only for Bloom.Blocked.
prop_insertMany :: FPR -> [Word64] -> Property
prop_insertMany (FPR fpr) keys =
    bloom_insert === bloom_insertMany
  where
    bloom_insert =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        mapM_ (Bloom.Blocked.insert mb) keys

    bloom_insertMany =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        Bloom.Blocked.insertMany mb (\k -> pure $ keys !! k) n

    !n = length keys
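
A remark on the lookup action (editorial): @\k -> pure $ keys !! k@ is O(k) per element, making the property quadratic in the number of keys, which is harmless at QuickCheck sizes. A sketch with O(1) indexing, assuming the test suite may depend on Data.Vector (imported qualified as V); the primed name is hypothetical:

prop_insertMany' :: FPR -> [Word64] -> Property
prop_insertMany' (FPR fpr) keys =
    bloom_insert === bloom_insertMany
  where
    bloom_insert =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        mapM_ (Bloom.Blocked.insert mb) keys

    bloom_insertMany =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        Bloom.Blocked.insertMany mb (\k -> pure (V.unsafeIndex keysV k)) n

    keysV = V.fromList keys -- O(1) indexing instead of O(k) with (!!)
    !n = length keys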

-------------------------------------------------------------------------------
-- Class to allow testing two filter implementations
-------------------------------------------------------------------------------
26 changes: 3 additions & 23 deletions lsm-tree.cabal
@@ -262,19 +262,8 @@ common language
    RoleAnnotations
    ViewPatterns

-flag bloom-query-fast
-  description: Use an optimised Bloom filter query implementation
-  default:     True
-  manual:      True
-
-common bloom-query-fast
-  if (flag(bloom-query-fast) && impl(ghc >=9.4))
-
-    --TODO: temporarily disabled:
-    --cpp-options: -DBLOOM_QUERY_FAST
-
library
-  import:          language, warnings, wno-x-partial, bloom-query-fast
+  import:          language, warnings, wno-x-partial
  hs-source-dirs:  src
  exposed-modules:
    Database.LSMTree
@@ -284,7 +273,6 @@ library
    Database.LSMTree.Internal.BlobFile
    Database.LSMTree.Internal.BlobRef
    Database.LSMTree.Internal.BloomFilter
-   Database.LSMTree.Internal.BloomFilterQuery1
    Database.LSMTree.Internal.ByteString
    Database.LSMTree.Internal.ChecksumHandle
    Database.LSMTree.Internal.Chunk
@@ -360,14 +348,6 @@ library
    , vector            ^>=0.13
    , vector-algorithms ^>=0.9

-  if (flag(bloom-query-fast) && impl(ghc >=9.4))
-    -- The bulk bloom filter query uses some fancy stuff
-    exposed-modules:  Database.LSMTree.Internal.StrictArray
-
-    --TODO: temporarily disabled
-    -- Database.LSMTree.Internal.BloomFilterQuery2
-    build-depends:    data-elevator ^>=0.1.0.2 || ^>=0.2
-
-- this exists due to windows
library xxhash
import: language
@@ -524,7 +504,7 @@ library extras
    , wide-word

test-suite lsm-tree-test
-  import:          language, warnings, wno-x-partial, bloom-query-fast
+  import:          language, warnings, wno-x-partial
  type:            exitcode-stdio-1.0
  hs-source-dirs:  test
  main-is:         Main.hs
@@ -681,7 +661,7 @@ benchmark lsm-tree-micro-bench
  ghc-options: -rtsopts -with-rtsopts=-T -threaded

benchmark lsm-tree-bench-bloomfilter
-  import:          language, warnings, wno-x-partial, bloom-query-fast
+  import:          language, warnings, wno-x-partial
  type:            exitcode-stdio-1.0
  hs-source-dirs:  bench/macro
  main-is:         lsm-tree-bench-bloomfilter.hs