diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4543885..1e8ad36 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,9 +2,9 @@ name: CI
 
 on:
   push:
-    branches: [ "develop" ]
-  pull_request: { }
-  workflow_dispatch: { }
+    branches: ["develop"]
+  pull_request: {}
+  workflow_dispatch: {}
 
 permissions:
   actions: read
@@ -12,7 +12,7 @@ permissions:
 
 jobs:
   build:
-    name: 'build'
+    name: "build"
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -40,7 +40,6 @@ jobs:
         uses: mozilla-actions/sccache-action@v0.0.5
       - name: Rust Compile Cache Config
         shell: bash
-        # echo "CARGO_LOG=cargo::core::compiler::fingerprint=info" >> $GITHUB_ENV
         run: |
           echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
           echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml
index e767d0d..5d30bcf 100644
--- a/.github/workflows/release-plz.yml
+++ b/.github/workflows/release-plz.yml
@@ -4,7 +4,6 @@ permissions:
   pull-requests: write
   contents: write
 
-# TODO(aduffy): uncomment when we're ready to publish
 on:
   push:
     branches:
diff --git a/benches/compress.rs b/benches/compress.rs
index 97f8c76..c9ff5af 100644
--- a/benches/compress.rs
+++ b/benches/compress.rs
@@ -1,12 +1,8 @@
-//! Compression benchmark.
-//!
-//! Contains benchmarks for FSST compression, decompression, and symbol table training.
-//!
-//! Also contains LZ4 baseline.
+//! Benchmarks for FSST compression, decompression, and symbol table training.
 #![allow(missing_docs)]
 use core::str;
 
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 
 use fsst::{Compressor, ESCAPE_CODE};
 
@@ -34,13 +30,23 @@ fn bench_fsst(c: &mut Criterion) {
     let decompressor = compressor.decompressor();
     let decompressed = decompressor.decompress(&compressed);
     let decompressed = str::from_utf8(&decompressed).unwrap();
-    println!("DECODED: {}", decompressed);
-    assert_eq!(decompressed, TEST);
 
+    group.throughput(Throughput::Elements(1));
+    group.bench_function("compress-word", |b| {
+        let mut out = vec![0u8; 8];
+        let out_ptr = out.as_mut_ptr();
+        let front = &TEST.as_bytes()[0..8];
+        let word = u64::from_le_bytes(front.try_into().unwrap());
+
+        b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) }));
+    });
+
+    group.throughput(Throughput::Bytes(CORPUS.len() as u64));
     group.bench_function("compress-single", |b| {
-        b.iter(|| black_box(compressor.compress(black_box(plaintext))));
+        b.iter(|| black_box(compressor.compress(black_box(CORPUS.as_bytes()))));
     });
 
+    group.throughput(Throughput::Bytes(decompressed.len() as u64));
     group.bench_function("decompress-single", |b| {
         b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
     });
diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock
index 354130a..8c0cc9f 100644
--- a/fuzz/Cargo.lock
+++ b/fuzz/Cargo.lock
@@ -21,7 +21,7 @@ dependencies = [
 
 [[package]]
 name = "fsst-rs"
-version = "0.0.1"
+version = "0.1.0"
 
 [[package]]
 name = "fsst-rs-fuzz"
diff --git a/src/builder.rs b/src/builder.rs
index 84ed370..c3272ae 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -7,8 +7,7 @@
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 
-use crate::find_longest::FindLongestSymbol;
-use crate::{Compressor, Symbol, MAX_CODE};
+use crate::{Compressor, Symbol, ESCAPE_CODE, MAX_CODE};
 
 #[derive(Debug, Clone)]
 struct Counter {
@@ -16,14 +15,29 @@ struct Counter {
     counts1: Vec<usize>,
 
     /// Frequency count for each code-pair.
-    counts2: Vec<Vec<usize>>,
+    counts2: Vec<usize>,
 }
 
+const COUNTS1_SIZE: usize = MAX_CODE as usize;
+// NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector,
+//  because `vec!` has a specialization for zero.
+const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE;
+
 impl Counter {
     fn new() -> Self {
         Self {
-            counts1: vec![0; MAX_CODE as usize],
-            counts2: vec![vec![0; MAX_CODE as usize]; MAX_CODE as usize],
+            counts1: vec![0; COUNTS1_SIZE],
+            counts2: vec![0; COUNTS2_SIZE],
+        }
+    }
+
+    /// Zero every counter in place.
+    ///
+    /// Training calls this between generations so a single allocation is reused instead
+    /// of building a fresh pair table each time.
+    pub fn reset(&mut self) {
+        for table in [&mut self.counts1, &mut self.counts2] {
+            table.fill(0);
         }
     }
 
@@ -34,7 +48,8 @@ impl Counter {
 
     #[inline]
     fn record_count2(&mut self, code1: u16, code2: u16) {
-        self.counts2[code1 as usize][code2 as usize] += 1;
+        let idx = (code1 as usize) * COUNTS1_SIZE + (code2 as usize);
+        self.counts2[idx] += 1;
     }
 
     #[inline]
@@ -44,14 +59,15 @@ impl Counter {
 
     #[inline]
     fn count2(&self, code1: u16, code2: u16) -> usize {
-        self.counts2[code1 as usize][code2 as usize]
+        let idx = (code1 as usize) * COUNTS1_SIZE + (code2 as usize);
+        self.counts2[idx]
     }
 }
 
 /// The number of generations used for training. This is taken from the [FSST paper].
 ///
 /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
-pub const MAX_GENERATIONS: usize = 5;
+const MAX_GENERATIONS: usize = 5;
 
 impl Compressor {
     /// Build and train a `Compressor` from a sample corpus of text.
@@ -70,69 +86,92 @@ impl Compressor {
         if sample.is_empty() {
             return compressor;
         }
-        for _generation in 0..MAX_GENERATIONS {
-            let counter = compressor.compress_count(sample);
-            compressor = compressor.optimize(counter);
+
+        let mut counter = Counter::new();
+
+        for _generation in 0..(MAX_GENERATIONS - 1) {
+            compressor.compress_count(sample, &mut counter);
+            compressor = compressor.optimize(&counter, true);
+            counter.reset();
         }
 
-        compressor
+        compressor.compress_count(sample, &mut counter);
+        compressor.optimize(&counter, true)
     }
 }
 
 impl Compressor {
     /// Compress the text using the current symbol table. Count the code occurrences
     /// and code-pair occurrences to allow us to calculate apparent gain.
-    fn compress_count(&self, sample: &[u8]) -> Counter {
-        let mut counter = Counter::new();
-        let len = sample.len();
-        let mut prev_code = self.find_longest_symbol(sample);
-        counter.record_count1(prev_code);
-        let mut pos = self.symbols[prev_code as usize].len();
+    fn compress_count(&self, sample: &[u8], counter: &mut Counter) {
+        let compressed = self.compress(sample);
+        let len = compressed.len();
+
+        if len == 0 {
+            return;
+        }
+
+        fn next_code(pos: usize, compressed: &[u8]) -> (u16, usize) {
+            if compressed[pos] == ESCAPE_CODE {
+                (compressed[pos + 1] as u16, 2)
+            } else {
+                (256 + compressed[pos] as u16, 1)
+            }
+        }
+
+        // Get first code, record count
+        let (code, pos) = next_code(0, &compressed);
+        counter.record_count1(code);
+
+        let mut pos = pos;
+        let mut prev_code = code;
 
         while pos < len {
-            let code = self.find_longest_symbol(&sample[pos..len]);
+            let (code, advance) = next_code(pos, &compressed);
+            pos += advance;
+
             counter.record_count1(code);
             counter.record_count2(prev_code, code);
-            pos += self.symbols[code as usize].len();
+
             prev_code = code;
         }
-
-        counter
     }
 
     /// Using a set of counters and the existing set of symbols, build a new
     /// set of symbols/codes that optimizes the gain over the distribution in `counter`.
-    fn optimize(&self, counters: Counter) -> Self {
+    fn optimize(&self, counters: &Counter, include_ascii: bool) -> Self {
         let mut res = Compressor::default();
-        let mut pqueue = BinaryHeap::new();
+        let mut pqueue = BinaryHeap::with_capacity(65_536);
         for code1 in 0u16..(256u16 + self.n_symbols as u16) {
             let symbol1 = self.symbols[code1 as usize];
-            let gain = counters.count1(code1) * symbol1.len();
-            pqueue.push(Candidate {
-                symbol: symbol1,
-                gain,
-            });
+            let mut gain = counters.count1(code1) * symbol1.len();
+            // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols.
+            // This helps to reduce exception counts.
+            if code1 < 256 {
+                gain *= 8;
+            }
+            if gain > 0 {
+                pqueue.push(Candidate {
+                    symbol: symbol1,
+                    gain,
+                });
+            }
 
             for code2 in 0u16..(256u16 + self.n_symbols as u16) {
                 let symbol2 = &self.symbols[code2 as usize];
                 // If either symbol is zero-length, or if merging would yield a symbol of
                 // length greater than 8, skip.
-                if symbol1.len() + symbol2.len() >= 8 {
+                if symbol1.len() + symbol2.len() > 8 {
                     continue;
                 }
                 let new_symbol = symbol1.concat(symbol2);
-                // as`sert the symbol is not empty
-                assert!(
-                    !new_symbol.is_empty(),
-                    "symbol made by merging {:?} and {:?} is empty",
-                    symbol1,
-                    symbol2,
-                );
-                let gain = counters.count2(code1, code2);
-                pqueue.push(Candidate {
-                    symbol: new_symbol,
-                    gain,
-                })
+                let gain = counters.count2(code1, code2) * new_symbol.len();
+                if gain > 0 {
+                    pqueue.push(Candidate {
+                        symbol: new_symbol,
+                        gain,
+                    })
+                }
             }
         }
 
@@ -145,6 +184,25 @@ impl Compressor {
             }
         }
 
+        // If there are leftover slots, fill them with ASCII chars.
+        // This helps reduce the number of escapes.
+        //
+        // Note that because of the lossy hash table, we won't accidentally
+        // save the same ASCII character twice into the table.
+        if include_ascii {
+            for character in
+                " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes()
+            {
+                if n_symbols == 255 {
+                    break;
+                }
+
+                if res.insert(Symbol::from_u8(character)) {
+                    n_symbols += 1
+                }
+            }
+        }
+
         res
     }
 }
@@ -152,6 +210,7 @@ impl Compressor {
 /// A candidate for inclusion in a symbol table.
 ///
 /// This is really only useful for the `optimize` step of training.
+#[derive(Copy, Clone, Debug)]
 struct Candidate {
     gain: usize,
     symbol: Symbol,
@@ -188,6 +247,7 @@ impl Ord for Candidate {
 
 #[cfg(test)]
 mod test {
+
     use crate::{Compressor, ESCAPE_CODE};
 
     #[test]
diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs
deleted file mode 100644
index 00eb7b2..0000000
--- a/src/find_longest/mod.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-mod naive;
-
-pub trait FindLongestSymbol {
-    fn find_longest_symbol(&self, text: &[u8]) -> u16;
-}
diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs
deleted file mode 100644
index e6519c1..0000000
--- a/src/find_longest/naive.rs
+++ /dev/null
@@ -1,28 +0,0 @@
-use crate::find_longest::FindLongestSymbol;
-use crate::Compressor;
-
-// Find the code that maps to a symbol with longest-match to a piece of text.
-//
-// This is the naive algorithm that just scans the whole table and is very slow.
-
-impl FindLongestSymbol for Compressor {
-    // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles.
-    #[inline(never)]
-    fn find_longest_symbol(&self, text: &[u8]) -> u16 {
-        debug_assert!(!text.is_empty(), "text must not be empty");
-
-        // Find the code that best maps to the provided text table here.
-        // Start with the code corresponding to the escape of the first character in the text
-        let mut best_code = text[0] as u16;
-        let mut best_overlap = 1;
-        for code in 256..(256 + self.n_symbols as u16) {
-            let symbol = &self.symbols[code as usize];
-            if symbol.is_prefix(text) && symbol.len() > best_overlap {
-                best_code = code;
-                best_overlap = symbol.len();
-            }
-        }
-
-        best_code
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index 86619ed..82ed0d7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,11 +10,9 @@ macro_rules! assert_sizeof {
 
 use std::fmt::{Debug, Formatter};
 
-pub use builder::*;
 use lossy_pht::LossyPHT;
 
 mod builder;
-mod find_longest;
 mod lossy_pht;
 
 /// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`Compressor`][`crate::Compressor`] and
@@ -43,9 +41,7 @@ impl Symbol {
 
     /// Create a new single-byte symbol
     pub fn from_u8(value: u8) -> Self {
-        Self {
-            bytes: [value, 0, 0, 0, 0, 0, 0, 0],
-        }
+        Self { num: value as u64 }
     }
 }
 
@@ -56,6 +52,7 @@ impl Symbol {
     /// can contain fewer bytes, padded with 0x00. There is a special case of a symbol
     /// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000`
     /// but we want to interpret that as a one-byte symbol containing `0x00`.
+    #[allow(clippy::len_without_is_empty)]
     pub fn len(&self) -> usize {
         let numeric = unsafe { self.num };
         // For little-endian platforms, this counts the number of *trailing* zeros
@@ -71,13 +68,6 @@ impl Symbol {
         }
     }
 
-    /// Returns true if the symbol does not encode any bytes.
-    ///
-    /// Note that this should only be true for the zero code.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
     #[inline]
     fn as_u64(&self) -> u64 {
         // SAFETY: the bytes can always be viewed as a u64
@@ -120,18 +110,43 @@ impl Symbol {
         let new_len = self_len + other.len();
         assert!(new_len <= 8, "cannot build symbol with length > 8");
 
-        let mut result = *self;
+        // SAFETY: any bit pattern is a valid `num`; the assert above caps the merged length at 8.
+        unsafe {
+            Self {
+                num: (other.num << (8 * self_len)) | self.num,
+            }
+        }
+    }
+}
 
-        // SAFETY: self_len and new_len are checked to be <= 8
-        unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) };
+#[cfg(test)]
+mod test {
+    use crate::Symbol;
 
-        result
+    #[test]
+    fn test_concat() {
+        let symbola = Symbol::from_u8(b'a');
+        let symbolb = Symbol::from_u8(b'b');
+        let symbolab = symbola.concat(&symbolb);
+        assert_eq!(symbolab.as_slice(), b"ab");
     }
 }
 
 impl Debug for Symbol {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", unsafe { self.bytes })
+        let debug = self
+            .as_slice()
+            .iter()
+            .map(|&byte| {
+                if byte.is_ascii() {
+                    char::from(byte).to_string()
+                } else {
+                    // Format the `u8` itself: the hex flag is ignored for `char`.
+                    format!("{byte:X?}")
+                }
+            })
+            .collect::<Vec<String>>();
+        write!(f, "{:?}", debug)
     }
 }
 
@@ -299,7 +314,7 @@ impl<'a> Decompressor<'a> {
 #[derive(Clone)]
 pub struct Compressor {
     /// Table mapping codes to symbols.
-    pub(crate) symbols: [Symbol; 511],
+    pub(crate) symbols: Vec<Symbol>,
 
     /// The number of entries in the symbol table that have been populated, not counting
     /// the escape values.
@@ -317,8 +332,14 @@ pub struct Compressor {
 
 impl Default for Compressor {
     fn default() -> Self {
+        // NOTE: `vec![0u64; N]` is one zeroed allocation; `vec![Symbol::ZERO; N]` clones N times.
+        let mut zeros = std::mem::ManuallyDrop::new(vec![0u64; 511]);
+        let (ptr, len, cap) = (zeros.as_mut_ptr(), zeros.len(), zeros.capacity());
+        // SAFETY: `Symbol` is a union over `u64` (same size and alignment, zero is valid), and
+        // `ManuallyDrop` hands the buffer to exactly one `Vec`, so it is freed exactly once.
+        let symbols = unsafe { Vec::from_raw_parts(ptr.cast::<Symbol>(), len, cap) };
         let mut table = Self {
-            symbols: [Symbol::ZERO; 511],
+            symbols,
             n_symbols: 0,
             codes_twobyte: vec![CodeMeta::EMPTY; 65_536],
             lossy_pht: LossyPHT::new(),
@@ -379,9 +400,8 @@ impl Compressor {
     /// # Safety
     ///
     /// `out_ptr` must never be NULL or otherwise point to invalid memory.
-    // NOTE(aduffy): uncomment this line to make the function appear in profiles
-    #[inline(never)]
-    pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) {
+    #[inline]
+    pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) {
         // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and
         // if it isn't, it will be overwritten anyway.
         //
@@ -436,11 +456,11 @@ impl Compressor {
 
         // SAFETY: `end` will point just after the end of the `plaintext` slice.
         let in_end = unsafe { in_ptr.byte_add(plaintext.len()) };
-        let in_end_sub8 = unsafe { in_end.byte_sub(8) };
+        let in_end_sub8 = in_end as usize - 8;
         // SAFETY: `end` will point just after the end of the `values` allocation.
         let out_end = unsafe { out_ptr.byte_add(values.capacity()) };
 
-        while in_ptr < in_end_sub8 && out_ptr < out_end {
+        while (in_ptr as usize) < in_end_sub8 && out_ptr < out_end {
             // SAFETY: pointer ranges are checked in the loop condition
             unsafe {
                 // Load a full 8-byte word of data from in_ptr.
@@ -521,6 +541,7 @@ impl Compressor {
     }
 }
 
+#[inline]
 fn advance_8byte_word(word: u64, bytes: usize) -> u64 {
     // shift the word off the right-end, because little endian means the first
     // char is stored in the LSB.
@@ -534,6 +555,7 @@ fn advance_8byte_word(word: u64, bytes: usize) -> u64 {
     }
 }
 
+#[inline]
 fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool {
     let mask = if ignored_bits == 64 {
         0
@@ -547,6 +569,7 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool {
 /// This is a function that will get monomorphized based on the value of `N` to do
 /// a load of `N` values from the pointer in a minimum number of instructions into
 /// an output `u64`.
+#[inline]
 unsafe fn extract_u64<const N: usize>(ptr: *const u8) -> u64 {
     match N {
         1 => ptr.read() as u64,
diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs
index db4bcf5..460631e 100644
--- a/src/lossy_pht.rs
+++ b/src/lossy_pht.rs
@@ -60,15 +60,12 @@ pub(crate) struct LossyPHT {
 impl LossyPHT {
     /// Construct a new empty lossy perfect hash table
     pub(crate) fn new() -> Self {
-        let mut slots = Vec::with_capacity(HASH_TABLE_SIZE);
-        // Initialize all slots to empty entries
-        for _ in 0..HASH_TABLE_SIZE {
-            slots.push(TableEntry {
-                symbol: Symbol::ZERO,
-                code: CodeMeta::EMPTY,
-                ignored_bits: 64,
-            });
-        }
+        let slots = [TableEntry {
+            symbol: Symbol::ZERO,
+            code: CodeMeta::EMPTY,
+            ignored_bits: 64,
+        }]
+        .repeat(HASH_TABLE_SIZE);
 
         Self { slots }
     }
@@ -95,11 +92,13 @@ impl LossyPHT {
         }
     }
 
+    #[inline]
     pub(crate) fn lookup(&self, word: u64) -> TableEntry {
         let prefix_3bytes = word & 0xFF_FF_FF;
         let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
 
-        self.slots[slot]
+        // SAFETY: `slot` is masked with `HASH_TABLE_SIZE - 1`, so it is always < HASH_TABLE_SIZE.
+        unsafe { *self.slots.get_unchecked(slot) }
     }
 
     /// Hash a value to find the bucket it belongs in.
diff --git a/tests/correctness.rs b/tests/correctness.rs
index d557f06..5a68cb1 100644
--- a/tests/correctness.rs
+++ b/tests/correctness.rs
@@ -59,22 +59,17 @@ fn test_zeros() {
 
 #[test]
 fn test_large() {
-    let mut corpus = String::new();
-    // TODO(aduffy): make this larger once table build performance is better.
-    while corpus.len() < 10 * 1_024 {
-        corpus.push_str(DECLARATION);
-    }
+    let corpus: Vec<u8> = DECLARATION.bytes().cycle().take(10_240).collect();
 
     let trained = Compressor::train(&corpus);
-    let mut massive = String::new();
-    while massive.len() < 16 * 1_024 * 1_024 {
-        massive.push_str(DECLARATION);
-    }
-    let compressed = trained.compress(massive.as_bytes());
-    assert_eq!(
-        trained.decompressor().decompress(&compressed),
-        massive.as_bytes()
-    );
+    let massive: Vec<u8> = DECLARATION
+        .bytes()
+        .cycle()
+        .take(16 * 1_024 * 1_024)
+        .collect();
+
+    let compressed = trained.compress(&massive);
+    assert_eq!(trained.decompressor().decompress(&compressed), massive);
 }
 
 #[test]