bloom filter bulk inserts and queries #723

Open · wants to merge 6 commits into main

19 changes: 2 additions & 17 deletions bench/macro/lsm-tree-bench-bloomfilter.hs
@@ -25,14 +25,10 @@ import Text.Printf (printf)

import Database.LSMTree.Extras.Orphans ()
import Database.LSMTree.Internal.Assertions (fromIntegralChecked)
-import qualified Database.LSMTree.Internal.BloomFilterQuery1 as Bloom1
+import qualified Database.LSMTree.Internal.BloomFilter as Bloom
import Database.LSMTree.Internal.Serialise (SerialisedKey,
                 serialiseKey)

-#ifdef BLOOM_QUERY_FAST
-import qualified Database.LSMTree.Internal.BloomFilterQuery2 as Bloom2
-#endif
-
main :: IO ()
main = do
    hSetBuffering stdout NoBuffering
@@ -112,22 +108,11 @@ benchmarks = do
    benchmark "bloomQueries1"
      "(this is the batch lookup, less the cost of computing and hashing the keys)"
      (benchInBatches benchmarkBatchSize rng0
-       (\ks -> Bloom1.bloomQueries vbs ks `seq` ()))
+       (\ks -> Bloom.bloomQueries vbs ks `seq` ()))
      (fromIntegralChecked benchmarkNumLookups)
      hashcost
      0

-#ifdef BLOOM_QUERY_FAST
-  _ <-
-    benchmark "bloomQueries2"
-      "(this is the optimised batch lookup, less the cost of computing and hashing the keys)"
-      (benchInBatches benchmarkBatchSize rng0
-        (\ks -> Bloom2.bloomQueries vbs ks `seq` ()))
-      (fromIntegralChecked benchmarkNumLookups)
-      hashcost
-      0
-#endif
-
    return ()

type Alloc = Int
87 changes: 87 additions & 0 deletions bloomfilter/src/Data/BloomFilter/Blocked.hs
@@ -55,6 +55,7 @@ module Data.BloomFilter.Blocked (
    new,
    maxSizeBits,
    insert,
+   insertMany,

    -- ** Conversion
    freeze,
@@ -74,7 +75,9 @@ module Data.BloomFilter.Blocked (
import Control.Monad.Primitive (PrimMonad, PrimState, RealWorld,
                stToPrim)
import Control.Monad.ST (ST, runST)
+import Data.Bits ((.&.))
import Data.Primitive.ByteArray (MutableByteArray)
+import qualified Data.Primitive.PrimArray as P

import Data.BloomFilter.Blocked.Calc
import Data.BloomFilter.Blocked.Internal hiding (deserialise)
@@ -190,3 +193,87 @@ deserialise bloomsize fill = do
    Internal.deserialise mbloom fill
    stToPrim $ unsafeFreeze mbloom


-----------------------------------------------------------
-- Bulk insert
--

{-# INLINE insertMany #-}
-- | A bulk insert of many elements.
--
-- This is somewhat faster than repeated insertion using 'insert'. It uses
-- memory prefetching to improve the utilisation of memory bandwidth. This has
-- the greatest benefit for large filters (that do not fit in the L3 cache) and
-- for inserting many elements, e.g. more than 10.
--
-- To get the best performance, you probably want to specialise this function
-- to the 'Hashable' instance and the lookup action. It is marked @INLINE@ to
-- help with this, but for the same reason you should avoid calling it from
-- many different call sites: define the specialisations you need and use those.
--
insertMany ::
     forall a s.
     Hashable a
  => MBloom s a
  -> (Int -> ST s a) -- ^ Action to look up elements, indexed @0..n-1@
  -> Int             -- ^ @n@, the number of elements to insert
  -> ST s ()
insertMany bloom key n =
    P.newPrimArray 0x10 >>= body
  where
    -- The general strategy is to use a rolling buffer @buf@ (of size 16). At
    -- the write end of the buffer, we prepare the probe locations and prefetch
    -- the corresponding cache line. At the read end, we do the hash insert.
    -- By having a prefetch distance of 15 between the write and read ends, we
    -- can have up to 15 memory reads in flight at once, thus improving
    -- utilisation of the memory bandwidth.
    body :: P.MutablePrimArray s (Hashes a) -> ST s ()
    body !buf = prepareProbes 0 0
      where
        -- Start by filling the buffer as far as we can: up to the prefetch
        -- distance of 15 entries (one less than the buffer size, so that a
        -- full buffer is distinguishable from an empty one), or until we run
        -- out of elements.
        prepareProbes :: Int -> Int -> ST s ()
        prepareProbes !i !i_w
          | i_w < 0x0f && i < n = do
              k <- key i
              let !kh = hashes k
              prefetchInsert bloom kh
              P.writePrimArray buf i_w kh
              prepareProbes (i+1) (i_w+1)

          | n > 0     = insertProbe i 0 i_w
          | otherwise = return ()

        -- Read from the read end of the buffer and do the inserts.
        insertProbe :: Int -> Int -> Int -> ST s ()
        insertProbe !i !i_r !i_w = do
            kh <- P.readPrimArray buf i_r
            insertHashes bloom kh
            nextProbe i i_r i_w

        -- Move on to the next entry.
        nextProbe :: Int -> Int -> Int -> ST s ()
        nextProbe !i !i_r !i_w
            -- If there are elements left, we prepare the next one at the
            -- write end of the buffer, before inserting the next element
            -- (from the read end of the buffer).
          | i < n = do
              k <- key i
              let !kh = hashes k
              prefetchInsert bloom kh
              P.writePrimArray buf i_w kh
              insertProbe
                (i+1)
                ((i_r + 1) .&. 0x0f)
                ((i_w + 1) .&. 0x0f)

            -- Or if there are no more elements to add to the buffer, but the
            -- buffer is still non-empty, we just loop, draining the buffer.
          | ((i_r + 1) .&. 0x0f) /= i_w =
              insertProbe
                i
                ((i_r + 1) .&. 0x0f)
                i_w

            -- When the buffer is empty, we're done.
          | otherwise = return ()
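
As a usage note (an editorial sketch, not part of this diff): since 'insertMany' is marked @INLINE@, the intended pattern is one specialised wrapper per key type and lookup action. A minimal sketch, assuming a Data.Vector dependency and a Hashable instance for Word64 (which the test suite below also relies on); the wrapper name is hypothetical:

import Control.Monad.ST (ST)
import Data.BloomFilter.Blocked (MBloom, insertMany)
import qualified Data.Vector as V
import Data.Word (Word64)

-- Hypothetical specialisation: bulk-insert every element of a boxed vector,
-- using O(1) vector indexing as the lookup action.
insertManyWord64 :: MBloom s Word64 -> V.Vector Word64 -> ST s ()
insertManyWord64 bloom keys =
    insertMany bloom (\i -> pure (V.unsafeIndex keys i)) (V.length keys)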
16 changes: 12 additions & 4 deletions bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs
@@ -34,7 +34,7 @@ import Data.Primitive.PrimArray
import Data.Word (Word64, Word8)

import GHC.Exts (Int (I#), prefetchByteArray0#,
-                prefetchMutableByteArray3#)
+                prefetchMutableByteArray0#)
import GHC.ST (ST (ST))

-- | An array of blocks of bits.
@@ -83,6 +83,9 @@ prefetchIndex (BitArray (PrimArray ba#)) (BlockIx blockIx) =

    assert (i >= 0 && i < sizeofByteArray (ByteArray ba#) - 63) $

+   -- In prefetchByteArray0, the 0 refers to a "non-temporal" load, which is
+   -- a hint that the value will be used soon, and then not used again
+   -- (soon). So the caches can evict the value as soon as they like.
    ST (\s -> case prefetchByteArray0# ba# i# s of
      s' -> (# s', () #))

@@ -133,8 +136,10 @@ unsafeSet (MBitArray arr) blockIx blockBitIx = do
{-# INLINE prefetchSet #-}
prefetchSet :: MBitArray s -> BlockIx -> ST s ()
prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do
-   -- For setting, we will do several writes to the same cache line, so
-   -- read it into all 3 levels of cache.
+   -- For setting, we will do several writes to the same cache line, but all
+   -- immediately after each other, after which we will not need the value in
+   -- the cache again (for a long time). So as with prefetchIndex we want to
+   -- disturb the caches as little as possible, using prefetchMutableByteArray0.
    let !(I# i#) = fromIntegral blockIx `shiftL` 6
    -- blockIx * 64 to go from block index to the byte offset of the beginning
    -- of the block. This offset is in bytes, not words.

@@ -144,7 +149,10 @@ prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do
    assert (let i = I# i# in i >= 0 && i < sz-63) $ return ()
#endif

-   ST (\s -> case prefetchMutableByteArray3# mba# i# s of
+   -- In prefetchMutableByteArray0, the 0 refers to a "non-temporal" load,
+   -- which is a hint that the value will be used soon, and then not used
+   -- again (soon). So the caches can evict the value as soon as they like.
+   ST (\s -> case prefetchMutableByteArray0# mba# i# s of
      s' -> (# s', () #))

freeze :: MBitArray s -> ST s BitArray
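
As background (editorial note, not part of this diff): the numeric suffix on GHC's prefetch primops selects a locality level, from 3 (high temporal locality: pull the line into all cache levels) down to 0 (non-temporal: use once, evict freely). A minimal sketch of the two read-side variants, assuming only GHC.Exts and the primitive package:

{-# LANGUAGE MagicHash, UnboxedTuples #-}
import Data.Primitive.ByteArray (ByteArray (..))
import GHC.Exts (Int (I#), prefetchByteArray0#, prefetchByteArray3#)
import GHC.ST (ST (ST))

-- Non-temporal prefetch of the cache line at byte offset i: hints that the
-- line will be read once soon and may then be evicted. This is the level
-- the PR switches to.
prefetchOnce :: ByteArray -> Int -> ST s ()
prefetchOnce (ByteArray ba#) (I# i#) =
    ST (\s -> case prefetchByteArray0# ba# i# s of s' -> (# s', () #))

-- Temporal prefetch: hints that the line will be reused, so it is pulled
-- into all cache levels. This is the level the PR switches away from.
prefetchKeep :: ByteArray -> Int -> ST s ()
prefetchKeep (ByteArray ba#) (I# i#) =
    ST (\s -> case prefetchByteArray3# ba# i# s of s' -> (# s', () #))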
20 changes: 20 additions & 0 deletions bloomfilter/tests/bloomfilter-tests.hs
@@ -44,6 +44,7 @@ tests =
      test_calculations proxyBlocked
        (FPR 1e-4, FPR 1e-1) (BitsPerEntry 3, BitsPerEntry 24) 1e-2
    , test_fromList proxyBlocked
    , testProperty "prop_insertMany" prop_insertMany
    ]
  , tests_hashes
  ]
@@ -240,6 +241,25 @@ prop_rechunked f s =
prop_rechunked_eq :: LBS.ByteString -> Property
prop_rechunked_eq = prop_rechunked hash64

-------------------------------------------------------------------------------
-- Bulk operations
-------------------------------------------------------------------------------

-- Currently only for Bloom.Blocked.
prop_insertMany :: FPR -> [Word64] -> Property
prop_insertMany (FPR fpr) keys =
    bloom_insert === bloom_insertMany
  where
    bloom_insert =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        mapM_ (Bloom.Blocked.insert mb) keys

    bloom_insertMany =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        Bloom.Blocked.insertMany mb (\k -> pure $ keys !! k) n

    !n = length keys
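
A remark on the lookup action (editorial): @\k -> pure $ keys !! k@ is O(k) per element, making the property quadratic in the number of keys, which is harmless at QuickCheck sizes. A sketch with O(1) indexing, assuming the test suite may depend on Data.Vector (imported qualified as V); the primed name is hypothetical:

prop_insertMany' :: FPR -> [Word64] -> Property
prop_insertMany' (FPR fpr) keys =
    bloom_insert === bloom_insertMany
  where
    bloom_insert =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        mapM_ (Bloom.Blocked.insert mb) keys

    bloom_insertMany =
      Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb ->
        Bloom.Blocked.insertMany mb (\k -> pure (V.unsafeIndex keysV k)) n

    keysV = V.fromList keys -- O(1) indexing instead of O(k) with (!!)
    !n = length keys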

-------------------------------------------------------------------------------
-- Class to allow testing two filter implementations
-------------------------------------------------------------------------------
26 changes: 3 additions & 23 deletions lsm-tree.cabal
@@ -262,19 +262,8 @@ common language
    RoleAnnotations
    ViewPatterns

-flag bloom-query-fast
-  description: Use an optimised Bloom filter query implementation
-  default:     True
-  manual:      True
-
-common bloom-query-fast
-  if (flag(bloom-query-fast) && impl(ghc >=9.4))
-
-    --TODO: temporarily disabled:
-    --cpp-options: -DBLOOM_QUERY_FAST
-
library
-  import:          language, warnings, wno-x-partial, bloom-query-fast
+  import:          language, warnings, wno-x-partial
  hs-source-dirs:  src
  exposed-modules:
    Database.LSMTree
@@ -284,7 +273,6 @@ library
    Database.LSMTree.Internal.BlobFile
    Database.LSMTree.Internal.BlobRef
    Database.LSMTree.Internal.BloomFilter
-   Database.LSMTree.Internal.BloomFilterQuery1
    Database.LSMTree.Internal.ByteString
    Database.LSMTree.Internal.ChecksumHandle
    Database.LSMTree.Internal.Chunk
@@ -360,14 +348,6 @@ library
    , vector            ^>=0.13
    , vector-algorithms ^>=0.9

-  if (flag(bloom-query-fast) && impl(ghc >=9.4))
-    -- The bulk bloom filter query uses some fancy stuff
-    exposed-modules:  Database.LSMTree.Internal.StrictArray
-
-    --TODO: temporarily disabled
-    -- Database.LSMTree.Internal.BloomFilterQuery2
-    build-depends:    data-elevator ^>=0.1.0.2 || ^>=0.2
-
-- this exists due to windows
library xxhash
import: language
@@ -524,7 +504,7 @@ library extras
    , wide-word

test-suite lsm-tree-test
-  import:          language, warnings, wno-x-partial, bloom-query-fast
+  import:          language, warnings, wno-x-partial
  type:            exitcode-stdio-1.0
  hs-source-dirs:  test
  main-is:         Main.hs
@@ -681,7 +661,7 @@ benchmark lsm-tree-micro-bench
  ghc-options: -rtsopts -with-rtsopts=-T -threaded

benchmark lsm-tree-bench-bloomfilter
-  import:          language, warnings, wno-x-partial, bloom-query-fast
+  import:          language, warnings, wno-x-partial
  type:            exitcode-stdio-1.0
  hs-source-dirs:  bench/macro
  main-is:         lsm-tree-bench-bloomfilter.hs