From 3057f9e1173667e56acb4e9096bbd6ee64f866af Mon Sep 17 00:00:00 2001
From: Nico Wagner <n.wagner@dnb.de>
Date: Wed, 28 Aug 2024 18:51:14 +0200
Subject: [PATCH] Refactor `pica-format` crate (#809)

Signed-off-by: Nico Wagner <n.wagner@dnb.de>
---
 .github/workflows/ci.yml                      |   5 +-
 crates/pica-format/Cargo.toml                 |   1 +
 crates/pica-format/fuzz/.gitignore            |   4 +
 crates/pica-format/fuzz/Cargo.toml            |  24 ++
 .../fuzz/fuzz_targets/fuzz_format.rs          |  12 +
 crates/pica-format/src/lib.rs                 | 400 ++++++++----------
 crates/pica-format/src/parse.rs               | 327 +++++++-------
 crates/pica-format/tests/integration.rs       |  40 ++
 crates/pica-format/tests/main.rs              |   1 +
 9 files changed, 442 insertions(+), 372 deletions(-)
 create mode 100644 crates/pica-format/fuzz/.gitignore
 create mode 100644 crates/pica-format/fuzz/Cargo.toml
 create mode 100644 crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs
 create mode 100644 crates/pica-format/tests/integration.rs
 create mode 100644 crates/pica-format/tests/main.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bb9df39e9..bf58f721f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -89,10 +89,11 @@ jobs:
     strategy:
       matrix:
         item:
-          - { name: pica-record-ref, fuzz-dir: crates/pica-record/fuzz, target: fuzz-record-ref, max-total-time: 300 }
+          - { name: pica-format, fuzz-dir: crates/pica-format/fuzz, target: fuzz-format, max-total-time: 300 }
+          - { name: pica-path, fuzz-dir: crates/pica-path/fuzz, target: fuzz-path, max-total-time: 300 }
           - { name: pica-record-matcher, fuzz-dir: crates/pica-matcher/fuzz, target: fuzz-record-matcher, max-total-time: 300 }
+          - { name: pica-record-ref, fuzz-dir: crates/pica-record/fuzz, target: fuzz-record-ref, max-total-time: 300 }
           - { name: pica-select-query, fuzz-dir: crates/pica-select/fuzz, target: fuzz-query, max-total-time: 300 }
-          - { name: pica-path, fuzz-dir: crates/pica-path/fuzz, target: fuzz-path, max-total-time: 300 }
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@nightly
diff --git a/crates/pica-format/Cargo.toml b/crates/pica-format/Cargo.toml
index b12ece1a0..58308d686 100644
--- a/crates/pica-format/Cargo.toml
+++ b/crates/pica-format/Cargo.toml
@@ -10,6 +10,7 @@ rust-version.workspace = true
 
 [dependencies]
 bstr = { workspace = true }
+pica-matcher = { workspace = true }
 pica-record = { workspace = true }
 thiserror = { workspace = true }
 winnow = { workspace = true, features = ["simd"] }
diff --git a/crates/pica-format/fuzz/.gitignore b/crates/pica-format/fuzz/.gitignore
new file mode 100644
index 000000000..1a45eee77
--- /dev/null
+++ b/crates/pica-format/fuzz/.gitignore
@@ -0,0 +1,4 @@
+target
+corpus
+artifacts
+coverage
diff --git a/crates/pica-format/fuzz/Cargo.toml b/crates/pica-format/fuzz/Cargo.toml
new file mode 100644
index 000000000..44ec0f247
--- /dev/null
+++ b/crates/pica-format/fuzz/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "pica-format-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.pica-format]
+path = ".."
+
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "fuzz-format"
+path = "fuzz_targets/fuzz_format.rs"
+test = false
+doc = false
+bench = false
diff --git a/crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs b/crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs
new file mode 100644
index 000000000..4a5280cc9
--- /dev/null
+++ b/crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs
@@ -0,0 +1,12 @@
+#![no_main]
+
+use std::str::FromStr;
+
+use libfuzzer_sys::fuzz_target;
+use pica_format::Format;
+
+fuzz_target!(|data: &[u8]| {
+    if let Ok(s) = std::str::from_utf8(data) {
+        let _format = Format::from_str(s);
+    }
+});
diff --git a/crates/pica-format/src/lib.rs b/crates/pica-format/src/lib.rs
index f069289ab..adee57ecf 100644
--- a/crates/pica-format/src/lib.rs
+++ b/crates/pica-format/src/lib.rs
@@ -1,92 +1,162 @@
-use parse::parse_format;
-use pica_record::{FieldRef, SubfieldRef};
+use std::str::FromStr;
+
+use pica_matcher::{OccurrenceMatcher, SubfieldMatcher, TagMatcher};
+use pica_record::{ByteRecord, FieldRef};
 use thiserror::Error;
-use winnow::Parser;
+use winnow::prelude::*;
 
 mod parse;
 
+pub use parse::parse_format;
+
+#[derive(Error, Debug, Clone, PartialEq)]
+#[error("{0} is not a valid format string")]
+pub struct ParseFormatError(String);
+
+/// A pica format expression.
 #[derive(Debug, Clone, PartialEq)]
-pub struct Format(Vec<Fragment>);
+pub struct Format {
+    tag_matcher: TagMatcher,
+    occurrence_matcher: OccurrenceMatcher,
+    subfield_matcher: Option<SubfieldMatcher>,
+    fragments: Fragments,
+}
 
 impl Format {
-    pub fn fragments(&self) -> impl Iterator<Item = &Fragment> {
-        self.0.iter()
+    /// Create a new format from the given format string.
+    ///
+    /// # Panics
+    ///
+    /// If the given format string is invalid this function panics. To
+    /// catch the parse error use `Format::from_str`.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use pica_format::Format;
+    ///
+    /// # fn main() {
+    /// let format = Format::new("041[A@]{ a <$> b | a? }");
+    /// # }
+    /// ```
+    pub fn new(fmt: &str) -> Self {
+        Self::from_str(fmt).expect("valid format expression")
+    }
+
+    /// Returns the tag matcher of the format expression.
+    pub fn tag_matcher(&self) -> &TagMatcher {
+        &self.tag_matcher
+    }
+
+    /// Returns the occurrence matcher of the format expression.
+    pub fn occurrence_matcher(&self) -> &OccurrenceMatcher {
+        &self.occurrence_matcher
+    }
+
+    /// Returns the subfield matcher of the format expression.
+    pub fn subfield_matcher(&self) -> Option<&SubfieldMatcher> {
+        self.subfield_matcher.as_ref()
+    }
+}
+
+impl FromStr for Format {
+    type Err = ParseFormatError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        parse_format
+            .parse(s.as_bytes())
+            .map_err(|_| ParseFormatError(s.to_string()))
     }
 }
 
 #[derive(Debug, Clone, PartialEq)]
-pub enum Fragment {
-    Atom(Atom),
+enum Fragments {
     Group(Group),
+    Value(Value),
+    List(List),
 }
 
 #[derive(Debug, Clone, PartialEq)]
-pub struct Atom {
+struct Value {
     codes: Vec<char>,
     prefix: Option<String>,
     suffix: Option<String>,
 }
 
-impl Atom {
-    fn format_subfield(
+impl Value {
+    fn format(
         &self,
-        buf: &mut String,
-        subfield: &SubfieldRef,
+        field: &FieldRef,
         options: &FormatOptions,
-    ) {
-        if !self.codes.contains(&subfield.code()) {
-            return;
-        }
+    ) -> Option<String> {
+        let subfield = self.codes.iter().find_map(|code| {
+            field.find(|subfield| subfield.code() == *code)
+        })?;
 
         let mut value = subfield.value().to_string();
+        if value.is_empty() {
+            return None;
+        }
+
         if options.strip_overread_char {
             value = value.replacen('@', "", 1);
         }
 
-        if !value.is_empty() {
-            if let Some(ref prefix) = self.prefix {
-                buf.push_str(prefix);
-            }
-
-            buf.push_str(&value);
+        if let Some(ref prefix) = self.prefix {
+            value.insert_str(0, prefix);
+        }
 
-            if let Some(ref suffix) = self.suffix {
-                buf.push_str(suffix);
-            }
+        if let Some(ref suffix) = self.suffix {
+            value.push_str(suffix)
         }
+
+        Some(value)
     }
 }
 
 #[derive(Debug, Clone, PartialEq)]
-pub struct Group {
-    atoms: Vec<Atom>,
+struct Group {
+    fragments: Box<Fragments>,
 }
 
-impl Fragment {
+#[derive(Debug, Clone, PartialEq)]
+enum List {
+    AndThen(Vec<Fragments>),
+    Cons(Vec<Fragments>),
+}
+
+impl List {
     fn format(
         &self,
-        buf: &mut String,
         field: &FieldRef,
         options: &FormatOptions,
-    ) {
+    ) -> Option<String> {
+        let mut acc = String::new();
+
         match self {
-            Self::Atom(atom) => {
-                if let Some(subfield) = atom
-                    .codes
-                    .iter()
-                    .find_map(|code| field.find(|s| s.code() == *code))
-                {
-                    atom.format_subfield(buf, subfield, options);
+            Self::AndThen(fragments) => {
+                for f in fragments.iter() {
+                    let Some(value) = f.format(field, options) else {
+                        break;
+                    };
+
+                    acc.push_str(&value);
                 }
             }
-            Self::Group(group) => {
-                field.subfields().iter().for_each(|subfield| {
-                    group.atoms.iter().for_each(|atom| {
-                        atom.format_subfield(buf, subfield, options);
-                    });
-                });
+            Self::Cons(fragments) => {
+                for f in fragments.iter() {
+                    if let Some(value) = f.format(field, options) {
+                        acc.push_str(&value);
+                    };
+                }
             }
         }
+
+        if !acc.is_empty() {
+            Some(acc)
+        } else {
+            None
+        }
     }
 }
 
@@ -95,14 +165,6 @@ pub struct FormatOptions {
     strip_overread_char: bool,
 }
 
-impl FormatOptions {
-    pub fn new(strip_overread_char: bool) -> Self {
-        Self {
-            strip_overread_char,
-        }
-    }
-}
-
 impl Default for FormatOptions {
     fn default() -> Self {
         Self {
@@ -111,202 +173,84 @@ impl Default for FormatOptions {
     }
 }
 
-#[derive(Error, Debug, Clone, PartialEq)]
-#[error("{0} is not a valid format string")]
-pub struct ParseFormatError(String);
+impl FormatOptions {
+    pub fn new() -> Self {
+        Self::default()
+    }
 
-impl Format {
-    /// Creates a new format from a string slice.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use pica_format::Format;
-    ///
-    /// # fn main() { example().unwrap(); }
-    /// fn example() -> anyhow::Result<()> {
-    ///     let _fmt = Format::new("a")?;
-    ///     Ok(())
-    /// }
-    /// ```
-    pub fn new<T>(fmt: T) -> Result<Self, ParseFormatError>
-    where
-        T: AsRef<str>,
-    {
-        parse_format
-            .parse(fmt.as_ref())
-            .map_err(|_| ParseFormatError(fmt.as_ref().into()))
+    /// Whether to strip the overread character '@' from a value or not.
+    pub fn strip_overread_char(mut self, yes: bool) -> Self {
+        self.strip_overread_char = yes;
+        self
     }
 }
 
-pub trait FormatExt {
-    fn format(&self, fmt: &Format, options: &FormatOptions) -> String;
+impl Fragments {
+    fn format(
+        &self,
+        field: &FieldRef,
+        options: &FormatOptions,
+    ) -> Option<String> {
+        match self {
+            Self::Value(value) => value.format(field, options),
+            Self::List(list) => list.format(field, options),
+            Self::Group(Group { fragments }) => {
+                fragments.format(field, options)
+            }
+        }
+    }
 }
 
-impl FormatExt for FieldRef<'_> {
-    /// Formats a field reference according to the format string.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use pica_format::{Format, FormatExt};
-    /// use pica_record::FieldRef;
-    ///
-    /// # fn main() { example().unwrap(); }
-    /// fn example() -> anyhow::Result<()> {
-    ///     let field =
-    ///         FieldRef::from_bytes(b"041A \x1faGoethe\x1e").unwrap();
-    ///     let format = Format::new("a")?;
-    ///     let options = Default::default();
-    ///     assert_eq!(field.format(&format, &options), "Goethe");
-    ///     Ok(())
-    /// }
-    /// ```
-    fn format(&self, fmt: &Format, options: &FormatOptions) -> String {
-        let mut buf = String::new();
-        fmt.fragments().for_each(|fragment| {
-            fragment.format(&mut buf, self, options);
-        });
+pub trait FormatExt {
+    fn format(
+        &self,
+        format: &Format,
+        options: &FormatOptions,
+    ) -> Vec<String>;
+}
 
-        buf
+impl FormatExt for ByteRecord<'_> {
+    fn format(
+        &self,
+        format: &Format,
+        options: &FormatOptions,
+    ) -> Vec<String> {
+        self.iter()
+            .filter(|field| field.tag() == format.tag_matcher())
+            .filter(|field| {
+                *format.occurrence_matcher() == field.occurrence()
+            })
+            .filter(|field| {
+                if let Some(m) = format.subfield_matcher() {
+                    m.is_match(field.subfields(), &Default::default())
+                } else {
+                    true
+                }
+            })
+            .filter_map(|field| format.fragments.format(field, options))
+            .collect()
     }
 }
 
 #[cfg(test)]
-mod test {
+mod tests {
     use super::*;
 
     type TestResult = anyhow::Result<()>;
 
     #[test]
-    fn test_format_subject_headings() -> TestResult {
-        let opts = FormatOptions::default();
-        let fmt = Format::new("a (' / ' x <|> ' (' g ')')")?;
-
-        let data = "041A \x1faPlymouth\x1fgMarke\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Plymouth (Marke)");
-
-        let data = "041A \x1faSchlacht um Berlin\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Schlacht um Berlin");
+    fn test_parse_format() -> TestResult {
+        let format =
+            Format::new("041A{ (a <*> b) <$> (c <*> d) | a? }");
 
-        let data = "041A \x1faDas @Gute\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Das Gute");
-
-        let data =
-            "041A \x1faBarletta\x1fxDisfida di Barletta\x1fgMotiv\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
+        assert_eq!(format.tag_matcher(), &TagMatcher::new("041A"));
         assert_eq!(
-            field.format(&fmt, &opts),
-            "Barletta / Disfida di Barletta (Motiv)"
+            format.occurrence_matcher(),
+            &OccurrenceMatcher::None
         );
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_format_geographic_names() -> TestResult {
-        let opts = FormatOptions::default();
-        let fmt = Format::new("a (' (' [gz] ')' <|> ' / ' x)")?;
-
-        let data = "065A \x1faArgolis\x1fzNord\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Argolis (Nord)");
-
-        let data = "065A \x1faUSA\x1fxSüdstaaten\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "USA / Südstaaten");
-
-        let data = "065A \x1faSanta Maria Maggiore\x1fgRom\
-            \x1fxKrippenkapelle\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(
-            field.format(&fmt, &opts),
-            "Santa Maria Maggiore (Rom) / Krippenkapelle"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_format_corporate_bodies() -> TestResult {
-        let opts = FormatOptions::default();
-        let fmt =
-            Format::new("a (' (' g ')' <|> ' / ' [xb] <|> ', ' n)")?;
-
-        let data = "029A \x1faThe @Hitmakers\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "The Hitmakers");
-
-        let data = "029A \x1faDeutschland\x1fgBundesrepublik\
-                    \x1fbAuswärtiges Amt\x1fbBibliothek\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(
-            field.format(&fmt, &opts),
-            "Deutschland (Bundesrepublik) / Auswärtiges Amt / Bibliothek"
-        );
-
-        let data = "029A \x1faTōkai Daigaku\x1fbKōgakubu\x1fn2\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(
-            field.format(&fmt, &opts),
-            "Tōkai Daigaku / Kōgakubu, 2"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_format_conferences() -> TestResult {
-        let opts = FormatOptions::default();
-        let fmt = Format::new(
-            "(n ' ') a (', ' d <|> ' (' c ')' <|> ' / ' [bx])",
-        )?;
-
-        let data = "030A \x1faInternationale Hofer Filmtage\
-                    \x1fn13.\x1fd1979\x1fcHof (Saale)\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(
-            field.format(&fmt, &opts),
-            "13. Internationale Hofer Filmtage, 1979 (Hof (Saale))"
-        );
-
-        let data = "030A \x1faOECD\x1fb\
-                    Ministerial Meeting on Science of OECD Countries\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(
-            field.format(&fmt, &opts),
-            "OECD / Ministerial Meeting on Science of OECD Countries"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_format_works() -> TestResult {
-        let opts = FormatOptions::default();
-        let fmt =
-            Format::new("a (' (' [fg] ')' <|> ', ' n <|> '. ' p)")?;
-
-        let data = "022A \x1faVerfassung\x1ff2011\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Verfassung (2011)");
-
-        let data = "022A \x1faFaust\x1fn1\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Faust, 1");
-
-        let data = "022A \x1faFaust\x1fgVolksbuch\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
-        assert_eq!(field.format(&fmt, &opts), "Faust (Volksbuch)");
-
-        let data = "022A \x1faBibel\x1fpPetrusbrief\x1fn1.-2.\x1e";
-        let field = FieldRef::from_bytes(data.as_bytes())?;
         assert_eq!(
-            field.format(&fmt, &opts),
-            "Bibel. Petrusbrief, 1.-2."
+            format.subfield_matcher().unwrap(),
+            &SubfieldMatcher::new("a?")
         );
 
         Ok(())
diff --git a/crates/pica-format/src/parse.rs b/crates/pica-format/src/parse.rs
index fc5a4d72d..ae1f5af3d 100644
--- a/crates/pica-format/src/parse.rs
+++ b/crates/pica-format/src/parse.rs
@@ -1,58 +1,206 @@
+use std::cell::RefCell;
+
+use bstr::ByteSlice;
+use pica_matcher::parser::{
+    parse_occurrence_matcher, parse_subfield_matcher, parse_tag_matcher,
+};
 use winnow::ascii::{multispace0, multispace1};
 use winnow::combinator::{
-    alt, delimited, opt, preceded, repeat, separated,
+    alt, delimited, opt, preceded, repeat, separated, terminated,
 };
 use winnow::error::{ContextError, ParserError};
 use winnow::prelude::*;
 use winnow::stream::{AsChar, Compare, Stream, StreamIsPartial};
 use winnow::token::{one_of, take_till};
 
-use crate::{Atom, Format, Fragment, Group};
+use crate::{Format, Fragments, Group, List, Value};
 
-/// Strip whitespaces from the beginning and end.
-pub(crate) fn ws<I, O, E: ParserError<I>, F>(
-    mut inner: F,
-) -> impl Parser<I, O, E>
-where
-    I: Stream + StreamIsPartial,
-    <I as Stream>::Token: AsChar + Clone,
-    F: Parser<I, O, E>,
-{
-    move |i: &mut I| {
-        let _ = multispace0.parse_next(i)?;
-        let o = inner.parse_next(i);
-        let _ = multispace0.parse_next(i)?;
-        o
-    }
+pub fn parse_format(i: &mut &[u8]) -> PResult<Format> {
+    (
+        parse_tag_matcher,
+        parse_occurrence_matcher,
+        delimited(
+            ws('{'),
+            (
+                parse_fragments,
+                opt(preceded(ws('|'), parse_subfield_matcher)),
+            ),
+            ws('}'),
+        ),
+    )
+        .map(|(t, o, (f, s))| Format {
+            tag_matcher: t,
+            occurrence_matcher: o,
+            subfield_matcher: s,
+            fragments: f,
+        })
+        .parse_next(i)
 }
 
-#[derive(Debug, Copy, Clone)]
-enum Quotes {
-    Single,
-    Double,
+fn parse_fragments(i: &mut &[u8]) -> PResult<Fragments> {
+    alt((
+        parse_list.map(Fragments::List),
+        parse_group.map(Fragments::Group),
+        parse_value.map(Fragments::Value),
+    ))
+    .parse_next(i)
+}
+
+fn parse_value(i: &mut &[u8]) -> PResult<Value> {
+    (opt(ws(parse_string)), parse_codes, opt(ws(parse_string)))
+        .map(|(prefix, codes, suffix)| Value {
+            prefix,
+            codes,
+            suffix,
+        })
+        .parse_next(i)
+}
+
+thread_local! {
+    pub static GROUP_LEVEL: RefCell<u32> = const { RefCell::new(0) };
+}
+
+fn increment_group_level(i: &mut &[u8]) -> PResult<()> {
+    GROUP_LEVEL.with(|level| {
+        *level.borrow_mut() += 1;
+        if *level.borrow() >= 32 {
+            Err(winnow::error::ErrMode::from_error_kind(
+                i,
+                winnow::error::ErrorKind::Many,
+            ))
+        } else {
+            Ok(())
+        }
+    })
+}
+
+fn decrement_group_level() {
+    GROUP_LEVEL.with(|level| {
+        *level.borrow_mut() -= 1;
+    })
+}
+
+/// Parses a parenthesized group of fragments, e.g. `(a <*> b)`.
+fn parse_group(i: &mut &[u8]) -> PResult<Group> {
+    ws('(').parse_next(i)?;
+    // Always undo the depth increment: otherwise failed or backtracked
+    // parses leak depth until every group on this thread is rejected.
+    let result = increment_group_level(i).and_then(|_| {
+        terminated(parse_fragments, ws(')')).parse_next(i)
+    });
+    decrement_group_level();
+    result.map(|f| Group { fragments: Box::new(f) })
+}
+
+fn parse_list(i: &mut &[u8]) -> PResult<List> {
+    alt((parse_list_cons, parse_list_and_then)).parse_next(i)
+}
+
+fn parse_list_cons(i: &mut &[u8]) -> PResult<List> {
+    separated(
+        2..,
+        alt((
+            parse_list_and_then.map(Fragments::List),
+            parse_group.map(Fragments::Group),
+            parse_value.map(Fragments::Value),
+        )),
+        ws("<*>"),
+    )
+    .map(List::Cons)
+    .parse_next(i)
+}
+
+fn parse_list_and_then(i: &mut &[u8]) -> PResult<List> {
+    separated(
+        2..,
+        alt((
+            parse_group.map(Fragments::Group),
+            parse_value.map(Fragments::Value),
+        )),
+        ws("<$>"),
+    )
+    .map(List::AndThen)
+    .parse_next(i)
 }
 
 /// Parses a subfield code (a single alpha-numeric character)
-fn parse_subfield_code(i: &mut &str) -> PResult<char> {
-    one_of(('0'..='9', 'a'..='z', 'A'..='Z')).parse_next(i)
+fn parse_code(i: &mut &[u8]) -> PResult<char> {
+    one_of(('0'..='9', 'a'..='z', 'A'..='Z'))
+        .map(char::from)
+        .parse_next(i)
 }
 
 /// Parses a sequence of subfield codes.
-fn parse_subfield_codes(i: &mut &str) -> PResult<Vec<char>> {
+fn parse_codes(i: &mut &[u8]) -> PResult<Vec<char>> {
     alt((
-        parse_subfield_code.map(|code| vec![code]),
-        delimited(ws('['), repeat(1.., parse_subfield_code), ws(']')),
+        parse_code.map(|code| vec![code]),
+        delimited(ws('['), repeat(2.., parse_code), ws(']')),
     ))
     .parse_next(i)
 }
 
+/// Parses a quoted string; invalid UTF-8 is a parse error, not a panic.
+fn parse_string(i: &mut &[u8]) -> PResult<String> {
+    let single = parse_quoted_string::<ContextError>(Quotes::Single);
+    let double = parse_quoted_string::<ContextError>(Quotes::Double);
+    alt((single, double))
+        .try_map(|s| s.to_str().map(str::to_owned))
+        .parse_next(i)
+}
+
+#[derive(Debug, Copy, Clone)]
+enum Quotes {
+    Single,
+    Double,
+}
+
 #[derive(Debug, Clone)]
 enum StringFragment<'a> {
-    Literal(&'a str),
+    Literal(&'a [u8]),
     EscapedChar(char),
     EscapedWs,
 }
 
+fn parse_quoted_string<'a, E>(
+    quotes: Quotes,
+) -> impl Parser<&'a [u8], Vec<u8>, E>
+where
+    E: ParserError<&'a [u8]>,
+{
+    use StringFragment::*;
+
+    let builder = repeat(
+        0..,
+        parse_quoted_string_fragment::<E>(quotes),
+    )
+    .fold(Vec::new, |mut acc, fragment| {
+        match fragment {
+            Literal(s) => acc.extend_from_slice(s),
+            EscapedChar(c) => acc.push(c as u8),
+            EscapedWs => {}
+        }
+
+        acc
+    });
+
+    match quotes {
+        Quotes::Single => delimited('\'', builder, '\''),
+        Quotes::Double => delimited('"', builder, '"'),
+    }
+}
+
+fn parse_quoted_string_fragment<'a, E: ParserError<&'a [u8]>>(
+    quotes: Quotes,
+) -> impl Parser<&'a [u8], StringFragment<'a>, E> {
+    use StringFragment::*;
+
+    alt((
+        parse_literal::<&'a [u8], E>(quotes).map(Literal),
+        parse_escaped_char::<&'a [u8], E>(quotes).map(EscapedChar),
+        preceded('\\', multispace1).value(EscapedWs),
+    ))
+}
+
 fn parse_literal<I, E>(
     quotes: Quotes,
 ) -> impl Parser<I, <I as Stream>::Slice, E>
@@ -93,122 +241,17 @@ where
     )
 }
 
-fn parse_quoted_string_fragment<'a, E: ParserError<&'a str>>(
-    quotes: Quotes,
-) -> impl Parser<&'a str, StringFragment<'a>, E> {
-    use StringFragment::*;
-
-    alt((
-        parse_literal::<&'a str, E>(quotes).map(Literal),
-        parse_escaped_char::<&'a str, E>(quotes).map(EscapedChar),
-        preceded('\\', multispace1).value(EscapedWs),
-    ))
-}
-
-fn parse_quoted_string<'a, E>(
-    quotes: Quotes,
-) -> impl Parser<&'a str, String, E>
+/// Strip whitespaces from the beginning and end.
+fn ws<I, O, E: ParserError<I>, F>(mut inner: F) -> impl Parser<I, O, E>
 where
-    E: ParserError<&'a str>,
+    I: Stream + StreamIsPartial,
+    <I as Stream>::Token: AsChar + Clone,
+    F: Parser<I, O, E>,
 {
-    use StringFragment::*;
-
-    let builder = repeat(
-        0..,
-        parse_quoted_string_fragment::<E>(quotes),
-    )
-    .fold(String::new, |mut acc, fragment| {
-        match fragment {
-            Literal(s) => acc.push_str(s),
-            EscapedChar(c) => acc.push(c),
-            EscapedWs => {}
-        }
-
-        acc
-    });
-
-    match quotes {
-        Quotes::Single => delimited('\'', builder, '\''),
-        Quotes::Double => delimited('"', builder, '"'),
-    }
-}
-
-fn parse_single_quoted_string(i: &mut &str) -> PResult<String> {
-    parse_quoted_string::<ContextError>(Quotes::Single).parse_next(i)
-}
-
-fn parse_double_quoted_string(i: &mut &str) -> PResult<String> {
-    parse_quoted_string::<ContextError>(Quotes::Double).parse_next(i)
-}
-
-fn parse_string(i: &mut &str) -> PResult<String> {
-    alt((parse_single_quoted_string, parse_double_quoted_string))
-        .parse_next(i)
-}
-
-fn parse_atom(i: &mut &str) -> PResult<Atom> {
-    (
-        opt(ws(parse_string)),
-        ws(parse_subfield_codes),
-        opt(ws(parse_string)),
-    )
-        .map(|(prefix, codes, suffix)| Atom {
-            prefix,
-            codes,
-            suffix,
-        })
-        .parse_next(i)
-}
-
-fn parse_group(i: &mut &str) -> PResult<Group> {
-    delimited(
-        ws('('),
-        ws(separated(1.., parse_atom, ws("<|>"))
-            .map(|atoms| Group { atoms })),
-        ws(')'),
-    )
-    .parse_next(i)
-}
-
-/// Parses a format fragment.
-fn parse_fragment(i: &mut &str) -> PResult<Fragment> {
-    alt((
-        ws(parse_atom).map(Fragment::Atom),
-        ws(parse_group).map(Fragment::Group),
-    ))
-    .parse_next(i)
-}
-
-/// Parses a format string.
-pub(crate) fn parse_format(i: &mut &str) -> PResult<Format> {
-    repeat(1.., parse_fragment).map(Format).parse_next(i)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    #[cfg_attr(miri, ignore)]
-    fn test_parse_subfield_code() {
-        for c in '\0'..=char::MAX {
-            if c.is_ascii_alphanumeric() {
-                assert_eq!(
-                    parse_subfield_codes.parse(&format!("[{c}]")),
-                    Ok(vec![c])
-                );
-                assert_eq!(
-                    parse_subfield_codes.parse(&format!("{c}")),
-                    Ok(vec![c])
-                );
-            } else {
-                assert!(parse_subfield_codes
-                    .parse(&format!("$[{c}]"))
-                    .is_err());
-                assert!(parse_subfield_codes
-                    .parse(&format!("${c}"))
-                    .is_err());
-            }
-        }
+    move |i: &mut I| {
+        let _ = multispace0.parse_next(i)?;
+        let o = inner.parse_next(i);
+        let _ = multispace0.parse_next(i)?;
+        o
     }
 }
diff --git a/crates/pica-format/tests/integration.rs b/crates/pica-format/tests/integration.rs
new file mode 100644
index 000000000..cb8ced61b
--- /dev/null
+++ b/crates/pica-format/tests/integration.rs
@@ -0,0 +1,40 @@
+use std::str::FromStr;
+use std::sync::OnceLock;
+
+use pica_format::{Format, FormatExt};
+use pica_record::ByteRecord;
+
+/// Returns the raw bytes of the Ada Lovelace test record (read once).
+fn ada_lovelace() -> &'static [u8] {
+    use std::path::Path;
+    use std::{env, fs};
+    static DATA: OnceLock<Vec<u8>> = OnceLock::new();
+    DATA.get_or_init(|| {
+        let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+        let path = Path::new(&manifest_dir)
+            .join("../pica-toolkit/tests/data/119232022.dat");
+        fs::read(path).unwrap()
+    })
+}
+
+#[test]
+fn test_format() -> anyhow::Result<()> {
+    let ada = ByteRecord::from_bytes(ada_lovelace()).expect("record");
+    let fmt = Format::from_str("028A{ a <$> (', ' d <*> ' ' c) }")?;
+    let result = ada.format(&fmt, &Default::default());
+    assert_eq!(result, vec!["Lovelace, Ada King of".to_string()]);
+
+    Ok(())
+}
+
+#[test]
+fn test_format_predicate() -> anyhow::Result<()> {
+    let ada = ByteRecord::from_bytes(ada_lovelace()).expect("record");
+    let fmt = Format::from_str(
+        "028[A@]{ a <$> (', ' d <*> ' ' c) | 4 == 'nafr'}",
+    )?;
+    let result = ada.format(&fmt, &Default::default());
+    assert_eq!(result, vec!["Byron, Ada Augusta".to_string()]);
+
+    Ok(())
+}
diff --git a/crates/pica-format/tests/main.rs b/crates/pica-format/tests/main.rs
new file mode 100644
index 000000000..6d3bbe604
--- /dev/null
+++ b/crates/pica-format/tests/main.rs
@@ -0,0 +1 @@
+mod integration;