diff --git a/bench/macro/lsm-tree-bench-bloomfilter.hs b/bench/macro/lsm-tree-bench-bloomfilter.hs index aee5830e8..efe230c04 100644 --- a/bench/macro/lsm-tree-bench-bloomfilter.hs +++ b/bench/macro/lsm-tree-bench-bloomfilter.hs @@ -25,14 +25,10 @@ import Text.Printf (printf) import Database.LSMTree.Extras.Orphans () import Database.LSMTree.Internal.Assertions (fromIntegralChecked) -import qualified Database.LSMTree.Internal.BloomFilterQuery1 as Bloom1 +import qualified Database.LSMTree.Internal.BloomFilter as Bloom import Database.LSMTree.Internal.Serialise (SerialisedKey, serialiseKey) -#ifdef BLOOM_QUERY_FAST -import qualified Database.LSMTree.Internal.BloomFilterQuery2 as Bloom2 -#endif - main :: IO () main = do hSetBuffering stdout NoBuffering @@ -112,22 +108,11 @@ benchmarks = do benchmark "bloomQueries1" "(this is the batch lookup, less the cost of computing and hashing the keys)" (benchInBatches benchmarkBatchSize rng0 - (\ks -> Bloom1.bloomQueries vbs ks `seq` ())) + (\ks -> Bloom.bloomQueries vbs ks `seq` ())) (fromIntegralChecked benchmarkNumLookups) hashcost 0 -#ifdef BLOOM_QUERY_FAST - _ <- - benchmark "bloomQueries2" - "(this is the optimised batch lookup, less the cost of computing and hashing the keys)" - (benchInBatches benchmarkBatchSize rng0 - (\ks -> Bloom2.bloomQueries vbs ks `seq` ())) - (fromIntegralChecked benchmarkNumLookups) - hashcost - 0 -#endif - return () type Alloc = Int diff --git a/bloomfilter/src/Data/BloomFilter/Blocked.hs b/bloomfilter/src/Data/BloomFilter/Blocked.hs index 168a03119..18f2c2ab7 100644 --- a/bloomfilter/src/Data/BloomFilter/Blocked.hs +++ b/bloomfilter/src/Data/BloomFilter/Blocked.hs @@ -55,6 +55,7 @@ module Data.BloomFilter.Blocked ( new, maxSizeBits, insert, + insertMany, -- ** Conversion freeze, @@ -74,7 +75,9 @@ module Data.BloomFilter.Blocked ( import Control.Monad.Primitive (PrimMonad, PrimState, RealWorld, stToPrim) import Control.Monad.ST (ST, runST) +import Data.Bits ((.&.)) import Data.Primitive.ByteArray (MutableByteArray) +import qualified Data.Primitive.PrimArray as P import Data.BloomFilter.Blocked.Calc import Data.BloomFilter.Blocked.Internal hiding (deserialise) @@ -190,3 +193,87 @@ deserialise bloomsize fill = do Internal.deserialise mbloom fill stToPrim $ unsafeFreeze mbloom + +----------------------------------------------------------- +-- Bulk insert +-- + +{-# INLINE insertMany #-} +-- | A bulk insert of many elements. +-- +-- This is somewhat faster than repeated insertion using 'insert'. It uses +-- memory prefetching to improve the utilisation of memory bandwidth. This has +-- greatest benefit for large filters (that do not fit in L3 cache) and for +-- inserting many elements, e.g. > 10. +-- +-- For best performance, you probably want to specialise this function to +-- the 'Hashable' instance and the lookup action; it is marked @INLINE@ to +-- help with this. Since it is marked inline, avoid calling it from many +-- locations: define the specialisations you need and use those. +-- +insertMany :: + forall a s. + Hashable a + => MBloom s a + -> (Int -> ST s a) -- ^ Action to look up elements, indexed @0..n-1@ + -> Int -- ^ @n@, number of elements to insert + -> ST s () +insertMany bloom key n = + P.newPrimArray 0x10 >>= body + where + -- The general strategy is to use a rolling buffer @buf@ (of size 16). At + -- the write end of the buffer, we prepare the probe locations and prefetch + -- the corresponding cache line. At the read end, we do the hash insert.
+ -- By having a prefetch distance of 15 between the write and read ends, we + -- can have up to 15 memory reads in flight at once, thus improving + -- utilisation of the memory bandwidth. + body :: P.MutablePrimArray s (Hashes a) -> ST s () + body !buf = prepareProbes 0 0 + where + -- Start by filling the buffer as far as we can, either to the end of + -- the buffer or until we run out of elements. + prepareProbes :: Int -> Int -> ST s () + prepareProbes !i !i_w + | i_w < 0x0f && i < n = do + k <- key i + let !kh = hashes k + prefetchInsert bloom kh + P.writePrimArray buf i_w kh + prepareProbes (i+1) (i_w+1) + + | n > 0 = insertProbe i 0 i_w + | otherwise = return () + + -- Read from the read end of the buffer and do the inserts. + insertProbe :: Int -> Int -> Int -> ST s () + insertProbe !i !i_r !i_w = do + kh <- P.readPrimArray buf i_r + insertHashes bloom kh + nextProbe i i_r i_w + + -- Move on to the next entry. + nextProbe :: Int -> Int -> Int -> ST s () + nextProbe !i !i_r !i_w + -- If there are elements left, we prepare them and add them at the + -- write end of the buffer, before inserting the next element + -- (from the read end of the buffer). + | i < n = do + k <- key i + let !kh = hashes k + prefetchInsert bloom kh + P.writePrimArray buf i_w kh + insertProbe + (i+1) + ((i_r + 1) .&. 0x0f) + ((i_w + 1) .&. 0x0f) + + -- Or if there are no more elements to add to the buffer, but the + -- buffer is still non-empty, we just loop draining the buffer. + | ((i_r + 1) .&. 0x0f) /= i_w = + insertProbe + i + ((i_r + 1) .&. 0x0f) + i_w + + -- When the buffer is empty, we're done. + | otherwise = return () diff --git a/bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs b/bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs index e7876ee4e..e5c97860e 100644 --- a/bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs +++ b/bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs @@ -34,7 +34,7 @@ import Data.Primitive.PrimArray import Data.Word (Word64, Word8) import GHC.Exts (Int (I#), prefetchByteArray0#, - prefetchMutableByteArray3#) + prefetchMutableByteArray0#) import GHC.ST (ST (ST)) -- | An array of blocks of bits. @@ -83,6 +83,9 @@ prefetchIndex (BitArray (PrimArray ba#)) (BlockIx blockIx) = assert (i >= 0 && i < sizeofByteArray (ByteArray ba#) - 63) $ + -- In prefetchByteArray0, the 0 refers to a "non temporal" load, which is + -- a hint that the value will be used soon, and then not used again (soon). + -- So the caches can evict the value as soon as they like. ST (\s -> case prefetchByteArray0# ba# i# s of s' -> (# s', () #)) @@ -133,8 +136,10 @@ unsafeSet (MBitArray arr) blockIx blockBitIx = do {-# INLINE prefetchSet #-} prefetchSet :: MBitArray s -> BlockIx -> ST s () prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do - -- For setting, we will do several writes to the same cache line, so - -- read it into all 3 levels of cache. + -- For setting, we will do several writes to the same cache line, but all + -- immediately after each other, after which we will not need the value in + -- the cache again (for a long time). So as with prefetchIndex we want to + -- disturb the caches the least, and so we use prefetchMutableByteArray0. let !(I# i#) = fromIntegral blockIx `shiftL` 6 -- blockIx * 64 to go from block index to the byte offset of the beginning -- of the block. This offset is in bytes, not words.
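As a usage sketch of the insertMany API added above: its haddocks recommend defining specialisations rather than calling the inlined function from many sites. A minimal sketch of such a wrapper (the name insertVector is hypothetical, and this assumes Data.BloomFilter.Blocked re-exports the Hashable class used in insertMany's signature):

    import Control.Monad.ST (ST)
    import qualified Data.BloomFilter.Blocked as Bloom
    import qualified Data.Vector as V

    -- Bulk-insert every element of a boxed vector. The lookup action is plain
    -- indexing, so insertMany can prefetch each key's cache line well before
    -- the corresponding insert is performed.
    insertVector :: Bloom.Hashable a => Bloom.MBloom s a -> V.Vector a -> ST s ()
    insertVector mb v =
        Bloom.insertMany mb (\i -> pure (V.unsafeIndex v i)) (V.length v)

The bloomInserts wrapper added to Database.LSMTree.Internal.RunAcc further down follows the same pattern, specialised to SerialisedKey and PageAcc.indexKeyPageAcc.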
@@ -144,7 +149,10 @@ prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do assert (let i = I# i# in i >= 0 && i < sz-63) $ return () #endif - ST (\s -> case prefetchMutableByteArray3# mba# i# s of + -- In prefetchMutableByteArray0, the 0 refers to a "non temporal" load, + -- which is a hint that the value will be used soon, and then not used + -- again (soon). So the caches can evict the value as soon as they like. + ST (\s -> case prefetchMutableByteArray0# mba# i# s of s' -> (# s', () #)) freeze :: MBitArray s -> ST s BitArray diff --git a/bloomfilter/tests/bloomfilter-tests.hs b/bloomfilter/tests/bloomfilter-tests.hs index 5ef556bd0..cc8aa1fcc 100644 --- a/bloomfilter/tests/bloomfilter-tests.hs +++ b/bloomfilter/tests/bloomfilter-tests.hs @@ -44,6 +44,7 @@ tests = test_calculations proxyBlocked (FPR 1e-4, FPR 1e-1) (BitsPerEntry 3, BitsPerEntry 24) 1e-2 , test_fromList proxyBlocked + , testProperty "prop_insertMany" prop_insertMany ] , tests_hashes ] @@ -240,6 +241,25 @@ prop_rechunked f s = prop_rechunked_eq :: LBS.ByteString -> Property prop_rechunked_eq = prop_rechunked hash64 +------------------------------------------------------------------------------- +-- Bulk operations +------------------------------------------------------------------------------- + +-- Currently only for Bloom.Blocked. +prop_insertMany :: FPR -> [Word64] -> Property +prop_insertMany (FPR fpr) keys = + bloom_insert === bloom_insertMany + where + bloom_insert = + Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb -> + mapM_ (Bloom.Blocked.insert mb) keys + + bloom_insertMany = + Bloom.Blocked.create (Bloom.Blocked.sizeForFPR fpr n) $ \mb -> + Bloom.Blocked.insertMany mb (\k -> pure $ keys !! k) n + + !n = length keys + ------------------------------------------------------------------------------- -- Class to allow testing two filter implementations ------------------------------------------------------------------------------- diff --git a/lsm-tree.cabal b/lsm-tree.cabal index 145044789..ad0608ac6 100644 --- a/lsm-tree.cabal +++ b/lsm-tree.cabal @@ -262,19 +262,8 @@ common language RoleAnnotations ViewPatterns -flag bloom-query-fast - description: Use an optimised Bloom filter query implementation - default: True - manual: True - -common bloom-query-fast - if (flag(bloom-query-fast) && impl(ghc >=9.4)) - ---TODO: temporarily disabled: ---cpp-options: -DBLOOM_QUERY_FAST - library - import: language, warnings, wno-x-partial, bloom-query-fast + import: language, warnings, wno-x-partial hs-source-dirs: src exposed-modules: Database.LSMTree @@ -284,7 +273,6 @@ library Database.LSMTree.Internal.BlobFile Database.LSMTree.Internal.BlobRef Database.LSMTree.Internal.BloomFilter - Database.LSMTree.Internal.BloomFilterQuery1 Database.LSMTree.Internal.ByteString Database.LSMTree.Internal.ChecksumHandle Database.LSMTree.Internal.Chunk @@ -360,14 +348,6 @@ library , vector ^>=0.13 , vector-algorithms ^>=0.9 - if (flag(bloom-query-fast) && impl(ghc >=9.4)) - -- The bulk bloom filter query uses some fancy stuff - exposed-modules: Database.LSMTree.Internal.StrictArray - - --TODO: temporarily disabled - -- Database.LSMTree.Internal.BloomFilterQuery2 - build-depends: data-elevator ^>=0.1.0.2 || ^>=0.2 - -- this exists due windows library xxhash import: language @@ -524,7 +504,7 @@ library extras , wide-word test-suite lsm-tree-test - import: language, warnings, wno-x-partial, bloom-query-fast + import: language, warnings, wno-x-partial type: exitcode-stdio-1.0 hs-source-dirs: test main-is: Main.hs @@ 
-681,7 +661,7 @@ benchmark lsm-tree-micro-bench ghc-options: -rtsopts -with-rtsopts=-T -threaded benchmark lsm-tree-bench-bloomfilter - import: language, warnings, wno-x-partial, bloom-query-fast + import: language, warnings, wno-x-partial type: exitcode-stdio-1.0 hs-source-dirs: bench/macro main-is: lsm-tree-bench-bloomfilter.hs diff --git a/src/Database/LSMTree/Internal/BloomFilter.hs b/src/Database/LSMTree/Internal/BloomFilter.hs index 7fedb0db7..d577fcba5 100644 --- a/src/Database/LSMTree/Internal/BloomFilter.hs +++ b/src/Database/LSMTree/Internal/BloomFilter.hs @@ -1,27 +1,164 @@ +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE UnboxedTuples #-} {-# OPTIONS_HADDOCK not-home #-} module Database.LSMTree.Internal.BloomFilter ( - BF.Bloom, - BF.MBloom, + -- * Types + Bloom.Bloom, + Bloom.MBloom, + + -- * Bulk query + bloomQueries, + RunIxKeyIx(RunIxKeyIx), + RunIx, KeyIx, + + -- * Serialisation bloomFilterVersion, bloomFilterToLBS, bloomFilterFromFile, ) where -import Control.Monad (void, when) -import Control.Monad.Class.MonadThrow -import Control.Monad.Primitive (PrimMonad) +import Data.Bits import qualified Data.ByteString as BS import qualified Data.ByteString.Builder.Extra as B import qualified Data.ByteString.Lazy as LBS -import qualified Data.Primitive.ByteArray as P +import qualified Data.Primitive as P +import qualified Data.Vector as V +import qualified Data.Vector.Primitive as VP import Data.Word (Word32, Word64, byteSwap32) + +import Control.Exception (assert) +import Control.Monad (void, when) +import Control.Monad.Class.MonadThrow +import Control.Monad.Primitive (PrimMonad) +import Control.Monad.ST (ST, runST) import System.FS.API -import qualified Data.BloomFilter.Blocked as BF +import Data.BloomFilter.Blocked (Bloom) +import qualified Data.BloomFilter.Blocked as Bloom import Database.LSMTree.Internal.ByteString (byteArrayToByteString) import Database.LSMTree.Internal.CRC32C (FileCorruptedError (..), FileFormat (..)) +import Database.LSMTree.Internal.Serialise (SerialisedKey) +import qualified Database.LSMTree.Internal.Vector as P + +import Prelude hiding (filter) + +-- Bulk query ----------------------------------------------------------- + +type KeyIx = Int +type RunIx = Int + +-- | A 'RunIxKeyIx' is a (compact) pair of a 'RunIx' and a 'KeyIx'. +-- +-- We represent it as a 32-bit word, using: +-- +-- * 16 bits for the run\/filter index (MSB) +-- * 16 bits for the key index (LSB) +-- +newtype RunIxKeyIx = MkRunIxKeyIx Word32 + deriving stock Eq + deriving newtype P.Prim + +pattern RunIxKeyIx :: RunIx -> KeyIx -> RunIxKeyIx +pattern RunIxKeyIx r k <- (unpackRunIxKeyIx -> (r, k)) + where + RunIxKeyIx r k = packRunIxKeyIx r k +{-# INLINE RunIxKeyIx #-} +{-# COMPLETE RunIxKeyIx #-} + +packRunIxKeyIx :: Int -> Int -> RunIxKeyIx +packRunIxKeyIx r k = + assert (r >= 0 && r <= 0xffff + && k >= 0 && k <= 0xffff) $ + MkRunIxKeyIx $ + (fromIntegral :: Word -> Word32) $ + (fromIntegral r `unsafeShiftL` 16) + .|. fromIntegral k +{-# INLINE packRunIxKeyIx #-} + +unpackRunIxKeyIx :: RunIxKeyIx -> (Int, Int) +unpackRunIxKeyIx (MkRunIxKeyIx c) = + ( fromIntegral (c `unsafeShiftR` 16) + , fromIntegral (c .&. 0xffff) + ) +{-# INLINE unpackRunIxKeyIx #-} + +instance Show RunIxKeyIx where + showsPrec _ (RunIxKeyIx r k) = + showString "RunIxKeyIx " . showsPrec 11 r + . showChar ' ' . showsPrec 11 k + +type ResIx = Int -- Result index +
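To make the packing above concrete: packRunIxKeyIx 3 5 produces the Word32 0x00030005 (run index in the high half, key index in the low half), and unpacking recovers (3, 5). Note that unpacking must mask with the full 16-bit mask 0xffff, or key indexes of 0x1000 and above would be truncated. A round-trip property in the style of the existing QuickCheck tests might look like this (prop_packRunIxKeyIx is a hypothetical sketch that clamps its inputs into the asserted ranges):

    prop_packRunIxKeyIx :: Int -> Int -> Bool
    prop_packRunIxKeyIx r k =
        let r' = r .&. 0xffff  -- clamp into the range packRunIxKeyIx asserts
            k' = k .&. 0xffff
        in  unpackRunIxKeyIx (packRunIxKeyIx r' k') == (r', k')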
+-- | Perform a batch of bloom queries. The result is a vector of pairs of +-- indexes into the vector of runs and the vector of keys respectively. The +-- order of keys and runs\/filters in the input is maintained in the output. +-- This implementation produces results in run-major order. +-- +-- The result vector can be of variable length. The initial estimate is 2x the +-- number of keys but this is grown if needed (using a doubling strategy). +-- +bloomQueries :: + V.Vector (Bloom SerialisedKey) + -> V.Vector SerialisedKey + -> VP.Vector RunIxKeyIx +bloomQueries !filters !keys | V.null filters || V.null keys = VP.empty +bloomQueries !filters !keys = + runST $ do + res <- P.newPrimArray (ksN * 2) + res' <- loop1 res 0 0 + parr <- P.unsafeFreezePrimArray res' + pure $! P.primArrayToPrimVector parr + where + !rsN = V.length filters + !ksN = V.length keys + !keyhashes = P.generatePrimArray (V.length keys) $ \i -> + Bloom.hashes (V.unsafeIndex keys i) + + -- loop over all filters + loop1 :: + P.MutablePrimArray s RunIxKeyIx + -> ResIx + -> RunIx + -> ST s (P.MutablePrimArray s RunIxKeyIx) + loop1 !res !resix !rix | rix == rsN = P.resizeMutablePrimArray res resix + loop1 !res !resix !rix = do + loop2_prefetch 0 + (res', resix') <- loop2 res resix 0 + loop1 res' resix' (rix+1) + where + !filter = V.unsafeIndex filters rix + + -- loop over all keys + loop2 :: + P.MutablePrimArray s RunIxKeyIx + -> ResIx + -> KeyIx + -> ST s (P.MutablePrimArray s RunIxKeyIx, ResIx) + loop2 !res2 !resix2 !kix + | kix == ksN = pure (res2, resix2) + | let !keyhash = P.indexPrimArray keyhashes kix + , Bloom.elemHashes filter keyhash = do + P.writePrimArray res2 resix2 (RunIxKeyIx rix kix) + ressz2 <- P.getSizeofMutablePrimArray res2 + res2' <- if resix2+1 < ressz2 + then return res2 + else P.resizeMutablePrimArray res2 (ressz2 * 2) + loop2 res2' (resix2+1) (kix+1) + + | otherwise = + loop2 res2 resix2 (kix+1) + + loop2_prefetch :: KeyIx -> ST s () + loop2_prefetch !kix + | kix == ksN = pure () + | otherwise = do + let !keyhash = P.indexPrimArray keyhashes kix + Bloom.prefetchElem filter keyhash + loop2_prefetch (kix+1) -- serialising ----------------------------------------------------------- @@ -29,19 +166,19 @@ import Database.LSMTree.Internal.CRC32C (FileCorruptedError (..), -- | By writing out the version in host endianness, we also indicate endianness. -- During deserialisation, we would discover an endianness mismatch. -- --- We base our version number on the 'BF.formatVersion' from the @bloomfilter@ +-- We base our version number on the 'Bloom.formatVersion' from the @bloomfilter@ -- library, plus our own version here. This accounts both for changes in the -- format code here, and changes in the library. -- bloomFilterVersion :: Word32 -bloomFilterVersion = 1 + fromIntegral BF.formatVersion +bloomFilterVersion = 1 + fromIntegral Bloom.formatVersion -bloomFilterToLBS :: BF.Bloom a -> LBS.ByteString +bloomFilterToLBS :: Bloom a -> LBS.ByteString bloomFilterToLBS bf = - let (size, ba, off, len) = BF.serialise bf + let (size, ba, off, len) = Bloom.serialise bf in header size <> byteArrayToLBS ba off len where - header BF.BloomSize { sizeBits, sizeHashes } = + header Bloom.BloomSize { sizeBits, sizeHashes } = -- creates a single 16 byte chunk B.toLazyByteStringWith (B.safeStrategy 16 B.smallChunkSize) mempty $ B.word32Host bloomFilterVersion @@ -58,14 +195,14 @@ bloomFilterToLBS bf = {-# SPECIALISE bloomFilterFromFile :: HasFS IO h -> Handle h - -> IO (BF.Bloom a) #-} --- | Read a 'BF.Bloom' from a file. + -> IO (Bloom a) #-} +-- | Read a 'Bloom' from a file.
-- bloomFilterFromFile :: (PrimMonad m, MonadCatch m) => HasFS m h -> Handle h -- ^ The open file, in read mode - -> m (BF.Bloom a) + -> m (Bloom a) bloomFilterFromFile hfs h = do header <- rethrowEOFError "Doesn't contain a header" $ hGetByteArrayExactly hfs h 16 @@ -87,10 +224,10 @@ bloomFilterFromFile hfs h = do -- read the filter data from the file directly into the bloom filter bloom <- - BF.deserialise - BF.BloomSize { - BF.sizeBits = fromIntegral nbits, - BF.sizeHashes = fromIntegral nhashes + Bloom.deserialise + Bloom.BloomSize { + Bloom.sizeBits = fromIntegral nbits, + Bloom.sizeHashes = fromIntegral nhashes } (\buf off len -> rethrowEOFError "bloom filter file too short" $ diff --git a/src/Database/LSMTree/Internal/BloomFilterQuery1.hs b/src/Database/LSMTree/Internal/BloomFilterQuery1.hs deleted file mode 100644 index b442a257a..000000000 --- a/src/Database/LSMTree/Internal/BloomFilterQuery1.hs +++ /dev/null @@ -1,130 +0,0 @@ -{-# LANGUAGE CPP #-} -{-# LANGUAGE MagicHash #-} -{-# LANGUAGE PatternSynonyms #-} -{-# LANGUAGE UnboxedTuples #-} -{-# OPTIONS_HADDOCK not-home #-} - -module Database.LSMTree.Internal.BloomFilterQuery1 ( - bloomQueries, - RunIxKeyIx(RunIxKeyIx), - RunIx, KeyIx, -) where - -import Data.Bits -import qualified Data.Primitive as P -import qualified Data.Vector as V -import qualified Data.Vector.Primitive as VP -import qualified Data.Vector.Primitive.Mutable as VPM -import Data.Word (Word32) - -import Control.Exception (assert) -import Control.Monad.ST (ST) - -import Data.BloomFilter.Blocked (Bloom) -import qualified Data.BloomFilter.Blocked as Bloom - -import Database.LSMTree.Internal.Serialise (SerialisedKey) - - --- Bulk query ------------------------------------------------------------ - -type KeyIx = Int -type RunIx = Int - --- | A 'RunIxKeyIx' is a (compact) pair of a 'RunIx' and a 'KeyIx'. --- --- We represent it as a 32bit word, using: --- --- * 16 bits for the run\/filter index (MSB) --- * 16 bits for the key index (LSB) --- -newtype RunIxKeyIx = MkRunIxKeyIx Word32 - deriving stock Eq - deriving newtype P.Prim - -pattern RunIxKeyIx :: RunIx -> KeyIx -> RunIxKeyIx -pattern RunIxKeyIx r k <- (unpackRunIxKeyIx -> (r, k)) - where - RunIxKeyIx r k = packRunIxKeyIx r k -{-# INLINE RunIxKeyIx #-} -{-# COMPLETE RunIxKeyIx #-} - -packRunIxKeyIx :: Int -> Int -> RunIxKeyIx -packRunIxKeyIx r k = - assert (r >= 0 && r <= 0xffff - && k >= 0 && k <= 0xffff) $ - MkRunIxKeyIx $ - (fromIntegral :: Word -> Word32) $ - (fromIntegral r `unsafeShiftL` 16) - .|. fromIntegral k -{-# INLINE packRunIxKeyIx #-} - -unpackRunIxKeyIx :: RunIxKeyIx -> (Int, Int) -unpackRunIxKeyIx (MkRunIxKeyIx c) = - ( fromIntegral (c `unsafeShiftR` 16) - , fromIntegral (c .&. 0xfff) - ) -{-# INLINE unpackRunIxKeyIx #-} - -instance Show RunIxKeyIx where - showsPrec _ (RunIxKeyIx r k) = - showString "RunIxKeyIx " . showsPrec 11 r - . showChar ' ' . showsPrec 11 k - -type ResIx = Int -- Result index - --- | Perform a batch of bloom queries. The result is a tuple of indexes into the --- vector of runs and vector of keys respectively. --- --- The result vector can be of variable length. The initial estimate is 2x the --- number of keys but this is grown if needed (using a doubling strategy). 
--- -bloomQueries :: - V.Vector (Bloom SerialisedKey) - -> V.Vector SerialisedKey - -> VP.Vector RunIxKeyIx -bloomQueries !blooms !ks - | rsN == 0 || ksN == 0 = VP.empty - | otherwise = VP.create $ do - res <- VPM.unsafeNew (V.length ks * 2) - loop1 res 0 0 - where - !rsN = V.length blooms - !ksN = V.length ks - - hs :: VP.Vector (Bloom.Hashes SerialisedKey) - !hs = VP.generate ksN $ \i -> Bloom.hashes (V.unsafeIndex ks i) - - -- Loop over all run indexes - loop1 :: - VPM.MVector s RunIxKeyIx - -> ResIx - -> RunIx - -> ST s (VPM.MVector s RunIxKeyIx) - loop1 !res1 !resix1 !rix - | rix == rsN = pure $ VPM.slice 0 resix1 res1 - | otherwise - = do - (res1', resix1') <- loop2 res1 resix1 0 (blooms `V.unsafeIndex` rix) - loop1 res1' resix1' (rix+1) - where - -- Loop over all key indexes - loop2 :: - VPM.MVector s RunIxKeyIx - -> ResIx - -> KeyIx - -> Bloom SerialisedKey - -> ST s (VPM.MVector s RunIxKeyIx, ResIx) - loop2 !res2 !resix2 !kix !b - | kix == ksN = pure (res2, resix2) - | let !h = hs `VP.unsafeIndex` kix - , Bloom.elemHashes b h = do - -- Double the vector if we've reached the end. - -- Note unsafeGrow takes the number to grow by, not the new size. - res2' <- if resix2 == VPM.length res2 - then VPM.unsafeGrow res2 (VPM.length res2) - else pure res2 - VPM.unsafeWrite res2' resix2 (RunIxKeyIx rix kix) - loop2 res2' (resix2+1) (kix+1) b - | otherwise = loop2 res2 resix2 (kix+1) b diff --git a/src/Database/LSMTree/Internal/BloomFilterQuery2.hs b/src/Database/LSMTree/Internal/BloomFilterQuery2.hs deleted file mode 100644 index 83f711c2e..000000000 --- a/src/Database/LSMTree/Internal/BloomFilterQuery2.hs +++ /dev/null @@ -1,412 +0,0 @@ -{-# LANGUAGE MagicHash #-} -{-# LANGUAGE PatternSynonyms #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE UnboxedTuples #-} -{-# OPTIONS_HADDOCK not-home #-} - --- The use of -fregs-iterative here does a better job in the hot loop for --- the bloomQueriesBody below. It eliminates all spilling to the stack. --- There's just 6 stack reads in the loop now, no writes. -{-# OPTIONS_GHC -O2 -fregs-iterative -fmax-inline-alloc-size=512 #-} - --- | An implementation of batched bloom filter query, optimised for memory --- prefetch. -module Database.LSMTree.Internal.BloomFilterQuery2 ( - bloomQueries, - RunIxKeyIx(RunIxKeyIx), - RunIx, KeyIx, - -- $algorithm - -- * Internals exposed for tests - CandidateProbe (..), -) where - -import Prelude hiding (filter) - -import Data.Bits -import qualified Data.Primitive as P -import qualified Data.Vector as V -import qualified Data.Vector.Primitive as VP -import Data.Word (Word64) - -import Control.Exception (assert) -import Control.Monad.ST (ST, runST) - -import GHC.Exts (Int#, uncheckedIShiftL#, (+#)) - -import Data.BloomFilter (Bloom) -import qualified Data.BloomFilter as Bloom -import qualified Data.BloomFilter.BitVec64 as BV64 -import qualified Data.BloomFilter.Hash as Bloom -import qualified Data.BloomFilter.Internal as BF -import Database.LSMTree.Internal.BloomFilterQuery1 (RunIxKeyIx (..)) -import Database.LSMTree.Internal.Serialise (SerialisedKey) -import qualified Database.LSMTree.Internal.StrictArray as P -import qualified Database.LSMTree.Internal.Vector as P - --- Bulk query ------------------------------------------------------------ - -type KeyIx = Int -type RunIx = Int - --- $algorithm --- --- == Key algorithm concepts --- --- There is almost no opportunity for memory prefetching when looking up a --- single key in a single Bloom filter. 
So this is a bulk algorithm, to do a --- lot of work all in one go, which does create the opportunity to prefetch --- memory. It also provides an opportunity to share key hashes across filters. --- --- We have as inputs N bloom filters and M keys to look up in them. So overall --- there are N * M independent bloom filter tests. The result is expected to be --- sparse, with roughly M*(1+FPR) positive results. So we use a sparse --- representation for the result: pairs of indexes identifying input Bloom --- filters and input keys with a positive test result. --- --- We aim for the number of memory operations in-flight simultaneously to be on --- the order of 32. This trades-off memory parallelism with cache use. In --- particular this means the algorithm must be able to use a fixed prefetch --- \"distance\" rather than this being dependent on the input sizes. To achieve --- this, we use a fixed size circular buffer (queue). The buffer size can be --- tuned to optimise the prefetch behaviour: indeed we pick exactly 32. --- --- == Micro optimisation --- --- We use primitive arrays and arrays -- rather than vectors -- so as to --- minimise the number of function variables, to be able to keep most things in --- registers. Normal vectors use two additional variables, which triples --- register pressure. --- --- We use an Array of Strict BloomFilter. This avoids the test for WHNF in the --- inner loop, which causes all registered to be spilled to and restored from ---the stack. --- - -type Candidate = RunIxKeyIx - --- | A candidate probe point is the combination of a filter\/run index, --- key index and hash number. This combination determines the probe location --- in the bloom filter, which is also cached. --- --- We store these in 'PrimArray's as a pair of 64bit words: --- --- * Low 64bit word: --- - 16 bits padding (always 0s) (MSB) --- - 16 bits for the hash number --- - 16 bits for the filter index --- - 16 bits for the key index (LSB) --- --- * High 64bit word: FilterBitIx --- -data CandidateProbe = MkCandidateProbe !Word64 !Word64 - -type HashNo = Int -type FilterBitIx = Int - -instance P.Prim CandidateProbe where - sizeOfType# _ = 16# - alignmentOfType# _ = 8# - - indexByteArray# ba i = - MkCandidateProbe - (P.indexByteArray# ba (indexLo i)) - (P.indexByteArray# ba (indexHi i)) - readByteArray# ba i s1 = - case P.readByteArray# ba (indexLo i) s1 of { (# s2, lo #) -> - case P.readByteArray# ba (indexHi i) s2 of { (# s3, hi #) -> - (# s3, MkCandidateProbe lo hi #) - }} - writeByteArray# ba i (MkCandidateProbe lo hi) s = - P.writeByteArray# ba (indexHi i) hi - (P.writeByteArray# ba (indexLo i) lo s) - - indexOffAddr# ba i = - MkCandidateProbe - (P.indexOffAddr# ba (indexLo i)) - (P.indexOffAddr# ba (indexHi i)) - readOffAddr# ba i s1 = - case P.readOffAddr# ba (indexLo i) s1 of { (# s2, lo #) -> - case P.readOffAddr# ba (indexHi i) s2 of { (# s3, hi #) -> - (# s3, MkCandidateProbe lo hi #) - }} - writeOffAddr# ba i (MkCandidateProbe lo hi) s = - P.writeOffAddr# ba (indexHi i) hi - (P.writeOffAddr# ba (indexLo i) lo s) - -indexLo :: Int# -> Int# -indexLo i = uncheckedIShiftL# i 1# - -indexHi :: Int# -> Int# -indexHi i = uncheckedIShiftL# i 1# +# 1# - -pattern CandidateProbe :: RunIx - -> KeyIx - -> HashNo - -> FilterBitIx - -> CandidateProbe -pattern CandidateProbe r k h p <- (unpackCandidateProbe -> (r, k, h, p)) - where - CandidateProbe r k h p = packCandidateProbe r k h p -{-# INLINE CandidateProbe #-} -{-# COMPLETE CandidateProbe #-} - -unpackCandidateProbe :: CandidateProbe -> (Int, Int, 
Int, Int) -unpackCandidateProbe (MkCandidateProbe lo hi) = - ( fromIntegral ((lo `unsafeShiftR` 16) .&. 0xffff) -- run ix - , fromIntegral ( lo .&. 0xffff) -- key ix - , fromIntegral (lo `unsafeShiftR` 32) -- hashno - , fromIntegral hi - ) -{-# INLINE unpackCandidateProbe #-} - -packCandidateProbe :: Int -> Int -> Int -> Int -> CandidateProbe -packCandidateProbe r k h p = - assert (r >= 0 && r <= 0xffff) $ - assert (k >= 0 && k <= 0xffff) $ - assert (h >= 0 && h <= 0xffff) $ - MkCandidateProbe - ( fromIntegral r `unsafeShiftL` 16 -- run ix - .|. fromIntegral k -- key ix - .|. fromIntegral h `unsafeShiftL` 32) -- hashno - (fromIntegral p) -{-# INLINE packCandidateProbe #-} - -instance Show CandidateProbe where - showsPrec _ (CandidateProbe r k h p) = - showString "CandidateProbe " - . showsPrec 11 r - . showChar ' ' - . showsPrec 11 k - . showChar ' ' - . showsPrec 11 h - . showChar ' ' - . showsPrec 11 p - - - -bloomQueries :: - V.Vector (Bloom SerialisedKey) - -> V.Vector SerialisedKey - -> VP.Vector RunIxKeyIx -bloomQueries filters keys | V.null filters || V.null keys - = VP.empty -bloomQueries filters keys = - assert (all BF.bloomInvariant filters) $ - -- in particular the bloomInvariant checks the size is > 0 - runST $ do - let !keyhashes = prepKeyHashes keys - !filters' = P.vectorToStrictArray filters - candidateProbes <- P.newPrimArray 0x20 - -- We deliberately do not clear this array as it will get overwritten. - -- But for debugging, it's less confusing to initialise to all zeros. - -- To do that, uncomment the following line: - --P.setPrimArray 0 0x20 candidateProbes (MkCandidateProbe 0 0) - - let i_r = 0 - (i_w, nextCandidate) <- - prepInitialCandidateProbes - filters' keyhashes - candidateProbes - 0 (RunIxKeyIx 0 0) - - output <- bloomQueriesBody filters' keyhashes candidateProbes - i_r i_w nextCandidate - return $! P.primArrayToPrimVector output - -prepKeyHashes :: V.Vector SerialisedKey - -> P.PrimArray (Bloom.CheapHashes SerialisedKey) -prepKeyHashes keys = - P.generatePrimArray (V.length keys) $ \i -> - Bloom.makeHashes (V.unsafeIndex keys i) - -prepInitialCandidateProbes :: - P.StrictArray (Bloom SerialisedKey) - -> P.PrimArray (Bloom.CheapHashes SerialisedKey) - -- ^ The pre-computed \"cheap hashes\" of the keys. - -> P.MutablePrimArray s CandidateProbe - -- ^ The array of candidates: of FilterIx, KeyIx and HashNo. This is - -- used as a fixed size rolling buffer. - -> Int - -- ^ The write index within the rolling buffer. This wraps around. - -> RunIxKeyIx - -- ^ The last candidate added to the rolling buffer. - -> ST s (Int, RunIxKeyIx) -prepInitialCandidateProbes - filters keyhashes - candidateProbes i_w - nextCandidate@(RunIxKeyIx rix kix) - - | i_w < 0x1f && rix < P.sizeofStrictArray filters = do - -- We prepare at most 31 (not 32) candidates, in the buffer of size 32. - -- This is because we define the buffer to be empty when i_r == i_w. Hence - -- we cannot fill all entries in the array. 
- - let !filter = P.indexStrictArray filters rix - !keyhash = P.indexPrimArray keyhashes kix - !hn = BF.hashesN filter - 1 - !bix = (fromIntegral :: Word64 -> Int) $ - Bloom.evalHashes keyhash hn - `BV64.unsafeRemWord64` -- size must be > 0 - BF.size filter -- bloomInvariant ensures this - BV64.prefetchIndex (BF.bitArray filter) bix - assert ((rix, kix, hn, bix) == unpackCandidateProbe (CandidateProbe rix kix hn bix)) $ return () - P.writePrimArray candidateProbes i_w (CandidateProbe rix kix hn bix) - prepInitialCandidateProbes - filters keyhashes - candidateProbes - (i_w+1) - (succCandidate keyhashes nextCandidate) - - -- Filled the buffer or ran out of candidates - | otherwise = return (i_w, nextCandidate) - - -{-# NOINLINE bloomQueriesBody #-} -bloomQueriesBody :: - forall s. - P.StrictArray (Bloom SerialisedKey) - -> P.PrimArray (Bloom.CheapHashes SerialisedKey) - -- ^ The pre-computed \"cheap hashes\" of the keys. - -> P.MutablePrimArray s CandidateProbe - -- ^ The array of candidates: of FilterIx, KeyIx, HashNo and FilterBitIx. - -- This is used as a fixed size rolling buffer. It is used to communicate - -- between iterations. On each iteration it is read to do the (already - -- prefetched) memory reads. And it is written with the prefetched bit indexes for the next iteration. - -> Int -> Int -> RunIxKeyIx - -> ST s (P.PrimArray RunIxKeyIx) -bloomQueriesBody !filters !keyhashes !candidateProbes = - \ !i_r !i_w !nextCandidate -> do - output <- P.newPrimArray (P.sizeofPrimArray keyhashes * 2) - output' <- - testCandidateProbe - i_r i_w nextCandidate - output 0 - P.unsafeFreezePrimArray output' - where - {-# INLINE prepGivenCandidateProbe #-} - - -- assume buff size of 0x20, so mask of 0x1f - testCandidateProbe, prepNextCandidateProbe :: - Int -> Int - -- ^ The read and write indexes within the rolling buffer. These wrap - -- around. - -> RunIxKeyIx - -- ^ The run and key index of the next candidate add to the rolling buffer. - -- When this passes the maximum then no more candidates are added, and we - -- just drain the buffer. - -> P.MutablePrimArray s RunIxKeyIx - -- ^ The output array. - -> Int - -- ^ The output array next free index. - -> ST s (P.MutablePrimArray s RunIxKeyIx) - - testCandidateProbe !i_r !i_w - !nextCandidate - !output !outputix = - assert (i_r >= 0 && i_r <= 0x1f) $ - assert (i_w >= 0 && i_w <= 0x1f) $ do - -- A candidate probe is the combination of a filter, key, hash no and - -- the corresponding probe location in the bloom filter. When a candidate - -- probe is prepared, its probe location is computed and prefetched. - candidateProbe <- P.readPrimArray candidateProbes i_r - let CandidateProbe rix kix hn bix = candidateProbe - - -- Now test the actual Bloom filter bit to see if it's set or not. - let filter = P.indexStrictArray filters rix - if BV64.unsafeIndex (BF.bitArray filter) bix - - -- The Bloom filter bit is set. Now we either keep this candidate but - -- probe the next hash number, or if the hash number reached zero then - -- this candidate (run index, key index) pair goes into the output, and - -- we prepare the next candidate. 
- then - if hn == 0 - then do - -- Output the candidate - P.writePrimArray output outputix (RunIxKeyIx rix kix) - outputsz <- P.getSizeofMutablePrimArray output - output' <- if outputix+1 < outputsz - then return output - else P.resizeMutablePrimArray output (outputsz * 2) - prepNextCandidateProbe - i_r i_w nextCandidate - output' (outputix+1) - - else do - -- The hashno has not reached zero, so we prepare a new candidate - -- with the same (filter,key) pair, but with the next hashno. - prepGivenCandidateProbe - i_r i_w nextCandidate - output outputix - rix kix (hn-1) filter - - -- The Bloom filter bit is not set, so abandon this filter,key pair and - -- prepare the next candidate. - else prepNextCandidateProbe - i_r i_w nextCandidate - output outputix - - prepNextCandidateProbe !i_r !i_w nextCandidate@(RunIxKeyIx rix kix) - !output !outputix - -- There is a next candidate. Write it into the rolling buffer at the write - -- index, and continue with an incremented buffer read and write index. - | rix < P.sizeofStrictArray filters = - let filter = P.indexStrictArray filters rix - hn = BF.hashesN filter - 1 - nextCandidate' = succCandidate keyhashes nextCandidate - in prepGivenCandidateProbe - i_r i_w nextCandidate' - output outputix - rix kix hn filter - - -- There is no next candidate but the candidate buffer is non-empty. - -- Don't write into the candidate buffer. Continue with incrementing the - -- buffer read pointer but not the write pointer. - | ((i_r + 1) .&. 0x1f) /= i_w = - testCandidateProbe - ((i_r + 1) .&. 0x1f) - i_w - nextCandidate - output outputix - - -- There is no next candidate and the candidate buffer is empty. - -- We're done! - | otherwise = - P.resizeMutablePrimArray output outputix - - prepGivenCandidateProbe :: - Int -> Int - -> RunIxKeyIx - -> P.MutablePrimArray s RunIxKeyIx - -> Int - -> Int -> Int -> Int - -> Bloom.Bloom SerialisedKey - -> ST s (P.MutablePrimArray s RunIxKeyIx) - prepGivenCandidateProbe !i_r !i_w - !nextCandidate - !output !outputix - !rix !kix !hn !filter = - assert (hn >= 0 && hn < BF.hashesN filter) $ do - let !keyhash = P.indexPrimArray keyhashes kix - !bix = (fromIntegral :: Word64 -> Int) $ - Bloom.evalHashes keyhash hn - `BV64.unsafeRemWord64` -- size must be > 0 - BF.size filter -- bloomInvariant ensures this - BV64.prefetchIndex (BF.bitArray filter) bix - P.writePrimArray candidateProbes i_w (CandidateProbe rix kix hn bix) - testCandidateProbe - ((i_r + 1) .&. 0x1f) - ((i_w + 1) .&. 0x1f) - -- or if we merge them: ((i_rw + 0x0101) .&. 
0x1f1f) - nextCandidate - output outputix - -succCandidate :: P.PrimArray (Bloom.CheapHashes SerialisedKey) - -> Candidate - -> Candidate -succCandidate keyhashes (RunIxKeyIx rix kix) - | kix+1 < P.sizeofPrimArray keyhashes = RunIxKeyIx rix (kix+1) - | otherwise = RunIxKeyIx (rix+1) 0 - --TODO: optimise with bit twiddling - diff --git a/src/Database/LSMTree/Internal/Lookup.hs b/src/Database/LSMTree/Internal/Lookup.hs index 146532e00..c546c02c7 100644 --- a/src/Database/LSMTree/Internal/Lookup.hs +++ b/src/Database/LSMTree/Internal/Lookup.hs @@ -36,7 +36,8 @@ import Control.Monad.ST.Strict import Control.RefCount import Database.LSMTree.Internal.BlobRef (WeakBlobRef (..)) -import Database.LSMTree.Internal.BloomFilter (Bloom) +import Database.LSMTree.Internal.BloomFilter (Bloom, RunIxKeyIx (..), + bloomQueries) import Database.LSMTree.Internal.Entry import Database.LSMTree.Internal.Index (Index) import qualified Database.LSMTree.Internal.Index as Index (search) @@ -54,14 +55,6 @@ import qualified Database.LSMTree.Internal.WriteBufferBlobs as WBB import System.FS.API (BufferOffset (..), Handle) import System.FS.BlockIO.API -#ifdef BLOOM_QUERY_FAST -import Database.LSMTree.Internal.BloomFilterQuery2 (RunIxKeyIx (..), - bloomQueries) -#else -import Database.LSMTree.Internal.BloomFilterQuery1 (RunIxKeyIx (..), - bloomQueries) -#endif - -- | Prepare disk lookups by doing bloom filter queries, index searches and -- creating 'IOOp's. The result is a vector of 'IOOp's and a vector of indexes, -- both of which are the same length. The indexes record the run and key diff --git a/src/Database/LSMTree/Internal/RunAcc.hs b/src/Database/LSMTree/Internal/RunAcc.hs index 4d88a1c8a..ee68811e2 100644 --- a/src/Database/LSMTree/Internal/RunAcc.hs +++ b/src/Database/LSMTree/Internal/RunAcc.hs @@ -162,10 +162,11 @@ addSmallKeyOp :: -> SerialisedKey -> Entry SerialisedValue BlobSpan -> ST s (Maybe (RawPage, Maybe Chunk)) -addSmallKeyOp racc@RunAcc{..} k e = +addSmallKeyOp racc@RunAcc{entryCount, mpageacc} k e = assert (PageAcc.entryWouldFitInPage k e) $ do modifyPrimVar entryCount (+1) - Bloom.insert mbloom k + -- We do not use Bloom.insert here. We accumulate keys (in the mpageacc) + -- and add them all to the mbloom in a batch in flushPageIfNonEmpty. pageBoundaryNeeded <- -- Try adding the key/op to the page accumulator to see if it fits. If @@ -208,9 +209,11 @@ addLargeKeyOp :: -> SerialisedKey -> Entry SerialisedValue BlobSpan -- ^ the full value, not just a prefix -> ST s ([RawPage], [RawOverflowPage], [Chunk]) -addLargeKeyOp racc@RunAcc{..} k e = +addLargeKeyOp racc@RunAcc{entryCount, mindex, mbloom} k e = assert (not (PageAcc.entryWouldFitInPage k e)) $ do modifyPrimVar entryCount (+1) + -- Large key/op pairs don't get added to the mpageacc, and thus bypass the + -- bulk bloom insert in flushPageIfNonEmpty, so we add them directly here. Bloom.insert mbloom k -- If the existing page accumulator is non-empty, we flush it, since the @@ -259,12 +262,15 @@ addLargeSerialisedKeyOp :: -- key\/op /without/ a 'BlobSpan'.
-> [RawOverflowPage] -- ^ The overflow pages for this key\/op -> ST s ([RawPage], [RawOverflowPage], [Chunk]) -addLargeSerialisedKeyOp racc@RunAcc{..} k page overflowPages = +addLargeSerialisedKeyOp racc@RunAcc{entryCount, mindex, mbloom} + k page overflowPages = assert (RawPage.rawPageNumKeys page == 1) $ assert (RawPage.rawPageHasBlobSpanAt page 0 == 0) $ assert (RawPage.rawPageOverflowPages page > 0) $ assert (RawPage.rawPageOverflowPages page == length overflowPages) $ do modifyPrimVar entryCount (+1) + -- Large key/op pairs don't get added to the mpageacc, and thus bypass the + -- bulk bloom insert in flushPageIfNonEmpty, so we add them directly here. Bloom.insert mbloom k -- If the existing page accumulator is non-empty, we flush it, since the @@ -282,10 +288,12 @@ addLargeSerialisedKeyOp racc@RunAcc{..} k page overflowPages = -- Returns @Nothing@ if the page accumulator was empty. -- flushPageIfNonEmpty :: RunAcc s -> ST s (Maybe (RawPage, Maybe Chunk)) -flushPageIfNonEmpty RunAcc{mpageacc, mindex} = do +flushPageIfNonEmpty RunAcc{mpageacc, mindex, mbloom} = do nkeys <- PageAcc.keysCountPageAcc mpageacc if nkeys > 0 then do + bloomInserts mbloom mpageacc nkeys + -- Grab the min and max keys, and add the page to the index. minKey <- PageAcc.indexKeyPageAcc mpageacc 0 maxKey <- PageAcc.indexKeyPageAcc mpageacc (nkeys-1) @@ -298,6 +306,14 @@ flushPageIfNonEmpty RunAcc{mpageacc, mindex} = do else pure Nothing +-- An instance of insertMany specialised to SerialisedKey and indexKeyPageAcc. +-- This is a performance-sensitive function. It is marked NOINLINE so we can +-- easily inspect the core and check all the specialisations worked as expected. +{-# NOINLINE bloomInserts #-} +bloomInserts :: MBloom s SerialisedKey -> PageAcc s -> Int -> ST s () +bloomInserts !mbloom !mpageacc !nkeys = + Bloom.insertMany mbloom (PageAcc.indexKeyPageAcc mpageacc) nkeys + -- | Internal helper for 'addLargeKeyOp' and 'addLargeSerialisedKeyOp'. -- Combine the result of 'flushPageIfNonEmpty' with extra pages and index -- chunks. diff --git a/src/Database/LSMTree/Internal/StrictArray.hs b/src/Database/LSMTree/Internal/StrictArray.hs deleted file mode 100644 index a6534d27c..000000000 --- a/src/Database/LSMTree/Internal/StrictArray.hs +++ /dev/null @@ -1,60 +0,0 @@ -{-# LANGUAGE MagicHash #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TypeApplications #-} -{-# LANGUAGE UnboxedTuples #-} -{-# OPTIONS_HADDOCK not-home #-} - -module Database.LSMTree.Internal.StrictArray ( - StrictArray, - vectorToStrictArray, - indexStrictArray, - sizeofStrictArray, -) where - -import Data.Elevator (Strict (Strict)) -import qualified Data.Vector as V -import GHC.Exts ((+#)) -import qualified GHC.Exts as GHC -import GHC.ST (ST (ST), runST) -import Unsafe.Coerce (unsafeCoerce) - -data StrictArray a = StrictArray !(GHC.Array# (Strict a)) - -vectorToStrictArray :: forall a. V.Vector a -> StrictArray a -vectorToStrictArray v = - runST $ ST $ \s0 -> - case GHC.newArray# (case V.length v of GHC.I# l# -> l#) - (Strict (unsafeCoerce ())) s0 of - -- initialise with (), will be overwritten. - (# s1, a# #) -> - case go a# 0# s1 of - s2 -> case GHC.unsafeFreezeArray# a# s2 of - (# s3, a'# #) -> (# s3, StrictArray a'# #) - where - go :: forall s. - GHC.MutableArray# s (Strict a) - -> GHC.Int# - -> GHC.State# s - -> GHC.State# s - go a# i# s - | GHC.I# i# < V.length v - = let x = V.unsafeIndex v (GHC.I# i#) - -- We have to use seq# here to force the array element to WHNF - -- before putting it into the strict array. 
This should not be - -- necessary. https://github.com/sgraf812/data-elevator/issues/4 - in case GHC.seq# x s of - (# s', x' #) -> - case GHC.writeArray# a# i# (Strict x') s' of - s'' -> go a# (i# +# 1#) s'' - | otherwise = s - -{-# INLINE indexStrictArray #-} -indexStrictArray :: StrictArray a -> Int -> a -indexStrictArray (StrictArray a#) (GHC.I# i#) = - case GHC.indexArray# a# i# of - (# Strict x #) -> x - -{-# INLINE sizeofStrictArray #-} -sizeofStrictArray :: StrictArray a -> Int -sizeofStrictArray (StrictArray a#) = - GHC.I# (GHC.sizeofArray# a#) diff --git a/test/Test/Database/LSMTree/Internal/BloomFilter.hs b/test/Test/Database/LSMTree/Internal/BloomFilter.hs index bd19ab724..22041d10d 100644 --- a/test/Test/Database/LSMTree/Internal/BloomFilter.hs +++ b/test/Test/Database/LSMTree/Internal/BloomFilter.hs @@ -1,5 +1,3 @@ -{-# LANGUAGE CPP #-} -{-# OPTIONS_GHC -Wno-orphans #-} module Test.Database.LSMTree.Internal.BloomFilter (tests) where import Control.DeepSeq (deepseq) @@ -25,18 +23,11 @@ import Test.QuickCheck.Instances () import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck hiding ((.&.)) -import qualified Data.BloomFilter.Blocked as BF +import qualified Data.BloomFilter.Blocked as Bloom import Database.LSMTree.Internal.BloomFilter -import qualified Database.LSMTree.Internal.BloomFilterQuery1 as Bloom1 import Database.LSMTree.Internal.Serialise (SerialisedKey, serialiseKey) -#ifdef BLOOM_QUERY_FAST -import qualified Database.LSMTree.Internal.BloomFilterQuery2 as Bloom2 -import Test.QuickCheck.Classes (primLaws) -import Test.Util.QC -#endif - --TODO: add a golden test for the BloomFilter format vs the 'formatVersion' -- to ensure we don't change the format without conciously bumping the version. tests :: TestTree @@ -50,11 +41,6 @@ tests = testGroup "Database.LSMTree.Internal.BloomFilter" prop_total_deserialisation_whitebox , testProperty "bloomQueries (bulk)" $ prop_bloomQueries1 -#ifdef BLOOM_QUERY_FAST - , testClassLaws "CandidateProbe" (primLaws (Proxy :: Proxy Bloom2.CandidateProbe)) - , testProperty "bloomQueries (bulk, prefetching)" $ - prop_bloomQueries2 -#endif ] roundtrip_prop :: Positive (Small Int) -> Positive Int -> [Word64] -> Property @@ -64,9 +50,9 @@ roundtrip_prop (Positive (Small hfN)) (Positive bits) ws = Left err -> counterexample (displayException err) $ property False Right rhs -> lhs === rhs where - sz = BF.BloomSize { sizeBits = limitBits bits, + sz = Bloom.BloomSize { sizeBits = limitBits bits, sizeHashes = hfN } - lhs = BF.create sz (\b -> mapM_ (BF.insert b) ws) + lhs = Bloom.create sz (\b -> mapM_ (Bloom.insert b) ws) bs = LBS.toStrict (bloomFilterToLBS lhs) limitBits :: Int -> Int @@ -81,7 +67,7 @@ prop_total_deserialisation bs = -- | Write the bytestring to a file in the mock file system and then use -- 'bloomFilterFromFile'. -bloomFilterFromBS :: BS.ByteString -> Either IOSim.Failure (BF.Bloom a) +bloomFilterFromBS :: BS.ByteString -> Either IOSim.Failure (Bloom a) bloomFilterFromBS bs = IOSim.runSim $ do hfs <- FSSim.simHasFS' MockFS.empty @@ -129,8 +115,9 @@ prop_bloomQueries1 :: FPR -> [Small Word64] -> Property prop_bloomQueries1 (FPR fpr) filters keys = - let filters' :: [BF.Bloom SerialisedKey] - filters' = map (BF.fromList (BF.policyForFPR fpr) . map (\(Small k) -> serialiseKey k)) + let filters' :: [Bloom SerialisedKey] + filters' = map (Bloom.fromList (Bloom.policyForFPR fpr) + . 
map (\(Small k) -> serialiseKey k)) filters keys' :: [SerialisedKey] @@ -141,12 +128,12 @@ prop_bloomQueries1 (FPR fpr) filters keys = [ (f_i, k_i) | (f, f_i) <- zip filters' [0..] , (k, k_i) <- zip keys' [0..] - , BF.elem k f + , Bloom.elem k f ] filterSets = map (Set.fromList . map (\(Small k) -> serialiseKey k)) filters referenceCmp = - [ (BF.elem k f, k `Set.member` f') + [ (Bloom.elem k f, k `Set.member` f') | (f, f') <- zip filters' filterSets , k <- keys' ] @@ -157,51 +144,13 @@ prop_bloomQueries1 (FPR fpr) filters keys = distribution = truePositives ++ falsePositives ++ trueNegatives ++ falseNegatives - -- To get coverage of Bloom1.bloomQueries array resizing we want some + -- To get coverage of bloomQueries array resizing we want some -- cases with high FPRs. in tabulate "FPR" [show (round (fpr * 10) * 10 :: Int) ++ "%"] $ coverTable "FPR" [("100%", 5)] $ tabulate "distribution of true/false positives/negatives" distribution $ referenceResults === - map (\(Bloom1.RunIxKeyIx rix kix) -> (rix, kix)) - (VP.toList (Bloom1.bloomQueries (V.fromList filters') - (V.fromList keys'))) - -#ifdef BLOOM_QUERY_FAST -prop_bloomQueries2 :: FPR - -> [[Small Word64]] - -> [Small Word64] - -> Property -prop_bloomQueries2 (FPR fpr) filters keys = - let filters' :: [BF.Bloom SerialisedKey] - filters' = map (BF.fromList (BF.policyForFPR fpr) . - map (\(Small k) -> serialiseKey k)) filters - - keys' :: [SerialisedKey] - keys' = map (\(Small k) -> serialiseKey k) keys - - referenceResults :: [(Int, Int)] - referenceResults = - [ (f_i, k_i) - | (f, f_i) <- zip filters' [0..] - , (k, k_i) <- zip keys' [0..] - , BF.elem k f - ] - - -- To get coverage of Bloom2.bloomQueries array resizing we want some - -- cases with high FPRs. - in tabulate "FPR" [show (round (fpr * 10) * 10 :: Int) ++ "%"] $ - coverTable "FPR" [("100%", 5)] $ - - referenceResults - === - map (\(Bloom2.RunIxKeyIx rix kix) -> (rix, kix)) - (VP.toList (Bloom2.bloomQueries (V.fromList filters') - (V.fromList keys'))) - -instance Arbitrary Bloom2.CandidateProbe where - arbitrary = Bloom2.MkCandidateProbe <$> arbitrary <*> arbitrary - -deriving stock instance Eq Bloom2.CandidateProbe -#endif + map (\(RunIxKeyIx rix kix) -> (rix, kix)) + (VP.toList (bloomQueries (V.fromList filters') + (V.fromList keys'))) diff --git a/test/Test/Database/LSMTree/Internal/Lookup.hs b/test/Test/Database/LSMTree/Internal/Lookup.hs index 26d39c13a..78688a33c 100644 --- a/test/Test/Database/LSMTree/Internal/Lookup.hs +++ b/test/Test/Database/LSMTree/Internal/Lookup.hs @@ -34,6 +34,7 @@ import Data.Map.Strict (Map) import qualified Data.Map.Strict as Map import Data.Maybe import Data.Monoid (Endo (..)) +import Data.Set (Set) import qualified Data.Set as Set import qualified Data.Vector as V import qualified Data.Vector.Primitive as VP @@ -130,21 +131,28 @@ prop_bloomQueriesModel :: SmallList (InMemLookupData SerialisedKey SerialisedValue BlobSpan) -> Property prop_bloomQueriesModel dats = - real === model + -- The model never returns false positives, but the real bloom filter does, + -- so the model result should be a subsequence of the real result. + counterexample (show model ++ " is not a subsequence of " ++ show real) $ + property (model `List.isSubsequenceOf` real) where - runs = getSmallList $ fmap (mkTestRun . 
runData) dats + runDatas = getSmallList $ fmap runData dats + runs = fmap mkTestRun runDatas blooms = fmap snd3 runs lookupss = concatMap lookups $ getSmallList dats real = map (\(RunIxKeyIx rix kix) -> (rix,kix)) $ VP.toList $ bloomQueries (V.fromList blooms) (V.fromList lookupss) - model = bloomQueriesModel blooms lookupss + model = bloomQueriesModel (fmap Map.keysSet runDatas) lookupss -bloomQueriesModel :: [Bloom SerialisedKey] -> [SerialisedKey] -> [(RunIx, KeyIx)] +-- | A bloom filter is a probabilistic set that can return false positives, but +-- not false negatives. The simplest model of a bloom filter is therefore a +-- non-probabilistic set: a set that only returns true positives or negatives. +bloomQueriesModel :: [Set SerialisedKey] -> [SerialisedKey] -> [(RunIx, KeyIx)] bloomQueriesModel blooms ks = [ (rix, kix) | (rix, b) <- rs' , (kix, k) <- ks' - , Bloom.elem k b + , Set.member k b ] where rs' = zip [0..] blooms
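A concrete instance of the subsequence check above, with made-up indexes: if the set-based model yields [(0,0),(1,2)] and bloomQueries additionally reports the false positive (0,1), the property still holds because both results are produced in the same run-major order:

    -- Data.List.isSubsequenceOf also requires matching relative order:
    -- >>> [(0,0),(1,2)] `isSubsequenceOf` [(0,0),(0,1),(1,2)]
    -- True
    -- >>> [(1,2),(0,0)] `isSubsequenceOf` [(0,0),(0,1),(1,2)]
    -- False

So besides checking set inclusion, the property also pins down that bloomQueries keeps its documented run-major output order.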