From 3057f9e1173667e56acb4e9096bbd6ee64f866af Mon Sep 17 00:00:00 2001 From: Nico Wagner Date: Wed, 28 Aug 2024 18:51:14 +0200 Subject: [PATCH] Refactor `pica-format` crate (#809) Signed-off-by: Nico Wagner --- .github/workflows/ci.yml | 5 +- crates/pica-format/Cargo.toml | 1 + crates/pica-format/fuzz/.gitignore | 4 + crates/pica-format/fuzz/Cargo.toml | 24 ++ .../fuzz/fuzz_targets/fuzz_format.rs | 12 + crates/pica-format/src/lib.rs | 400 ++++++++---------- crates/pica-format/src/parse.rs | 327 +++++++------- crates/pica-format/tests/integration.rs | 40 ++ crates/pica-format/tests/main.rs | 1 + 9 files changed, 442 insertions(+), 372 deletions(-) create mode 100644 crates/pica-format/fuzz/.gitignore create mode 100644 crates/pica-format/fuzz/Cargo.toml create mode 100644 crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs create mode 100644 crates/pica-format/tests/integration.rs create mode 100644 crates/pica-format/tests/main.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb9df39e9..bf58f721f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -89,10 +89,11 @@ jobs: strategy: matrix: item: - - { name: pica-record-ref, fuzz-dir: crates/pica-record/fuzz, target: fuzz-record-ref, max-total-time: 300 } + - { name: pica-format, fuzz-dir: crates/pica-format/fuzz, target: fuzz-format, max-total-time: 300 } + - { name: pica-path, fuzz-dir: crates/pica-path/fuzz, target: fuzz-path, max-total-time: 300 } - { name: pica-record-matcher, fuzz-dir: crates/pica-matcher/fuzz, target: fuzz-record-matcher, max-total-time: 300 } + - { name: pica-record-ref, fuzz-dir: crates/pica-record/fuzz, target: fuzz-record-ref, max-total-time: 300 } - { name: pica-select-query, fuzz-dir: crates/pica-select/fuzz, target: fuzz-query, max-total-time: 300 } - - { name: pica-path, fuzz-dir: crates/pica-path/fuzz, target: fuzz-path, max-total-time: 300 } steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@nightly diff --git 
a/crates/pica-format/Cargo.toml b/crates/pica-format/Cargo.toml index b12ece1a0..58308d686 100644 --- a/crates/pica-format/Cargo.toml +++ b/crates/pica-format/Cargo.toml @@ -10,6 +10,7 @@ rust-version.workspace = true [dependencies] bstr = { workspace = true } +pica-matcher = { workspace = true } pica-record = { workspace = true } thiserror = { workspace = true } winnow = { workspace = true, features = ["simd"] } diff --git a/crates/pica-format/fuzz/.gitignore b/crates/pica-format/fuzz/.gitignore new file mode 100644 index 000000000..1a45eee77 --- /dev/null +++ b/crates/pica-format/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/crates/pica-format/fuzz/Cargo.toml b/crates/pica-format/fuzz/Cargo.toml new file mode 100644 index 000000000..44ec0f247 --- /dev/null +++ b/crates/pica-format/fuzz/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "pica-format-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.pica-format] +path = ".." 
+ +[workspace] +members = ["."] + +[[bin]] +name = "fuzz-format" +path = "fuzz_targets/fuzz_format.rs" +test = false +doc = false +bench = false diff --git a/crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs b/crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs new file mode 100644 index 000000000..4a5280cc9 --- /dev/null +++ b/crates/pica-format/fuzz/fuzz_targets/fuzz_format.rs @@ -0,0 +1,12 @@ +#![no_main] + +use std::str::FromStr; + +use libfuzzer_sys::fuzz_target; +use pica_format::Format; + +fuzz_target!(|data: &[u8]| { + if let Ok(s) = std::str::from_utf8(data) { + let _format = Format::from_str(s); + } +}); diff --git a/crates/pica-format/src/lib.rs b/crates/pica-format/src/lib.rs index f069289ab..adee57ecf 100644 --- a/crates/pica-format/src/lib.rs +++ b/crates/pica-format/src/lib.rs @@ -1,92 +1,162 @@ -use parse::parse_format; -use pica_record::{FieldRef, SubfieldRef}; +use std::str::FromStr; + +use pica_matcher::{OccurrenceMatcher, SubfieldMatcher, TagMatcher}; +use pica_record::{ByteRecord, FieldRef}; use thiserror::Error; -use winnow::Parser; +use winnow::prelude::*; mod parse; +pub use parse::parse_format; + +#[derive(Error, Debug, Clone, PartialEq)] +#[error("{0} is not a valid format string")] +pub struct ParseFormatError(String); + +/// A pica format expression. #[derive(Debug, Clone, PartialEq)] -pub struct Format(Vec); +pub struct Format { + tag_matcher: TagMatcher, + occurrence_matcher: OccurrenceMatcher, + subfield_matcher: Option, + fragments: Fragments, +} impl Format { - pub fn fragments(&self) -> impl Iterator { - self.0.iter() + /// Create a new format from the given format string. + /// + /// # Panics + /// + /// If the given format string is invalid this function panics. To + /// catch the parse error use `Format::from_str`. + /// + /// # Example + /// + /// ```rust + /// use pica_format::Format; + /// + /// # fn main() { + /// let format = Format::new("041[A@]{ a <$> b | a? 
}"); + /// # } + /// ``` + pub fn new(fmt: &str) -> Self { + Self::from_str(fmt).expect("valid format expression") + } + + /// Returns the tag matcher of the format expression. + pub fn tag_matcher(&self) -> &TagMatcher { + &self.tag_matcher + } + + /// Returns the occurrence matcher of the format expression. + pub fn occurrence_matcher(&self) -> &OccurrenceMatcher { + &self.occurrence_matcher + } + + /// Retruns the subfield matcher of the format expression. + pub fn subfield_matcher(&self) -> Option<&SubfieldMatcher> { + self.subfield_matcher.as_ref() + } +} + +impl FromStr for Format { + type Err = ParseFormatError; + + fn from_str(s: &str) -> Result { + parse_format + .parse(s.as_bytes()) + .map_err(|_| ParseFormatError(s.to_string())) } } #[derive(Debug, Clone, PartialEq)] -pub enum Fragment { - Atom(Atom), +enum Fragments { Group(Group), + Value(Value), + List(List), } #[derive(Debug, Clone, PartialEq)] -pub struct Atom { +struct Value { codes: Vec, prefix: Option, suffix: Option, } -impl Atom { - fn format_subfield( +impl Value { + fn format( &self, - buf: &mut String, - subfield: &SubfieldRef, + field: &FieldRef, options: &FormatOptions, - ) { - if !self.codes.contains(&subfield.code()) { - return; - } + ) -> Option { + let subfield = self.codes.iter().find_map(|code| { + field.find(|subfield| subfield.code() == *code) + })?; let mut value = subfield.value().to_string(); + if value.is_empty() { + return None; + } + if options.strip_overread_char { value = value.replacen('@', "", 1); } - if !value.is_empty() { - if let Some(ref prefix) = self.prefix { - buf.push_str(prefix); - } - - buf.push_str(&value); + if let Some(ref prefix) = self.prefix { + value.insert_str(0, prefix); + } - if let Some(ref suffix) = self.suffix { - buf.push_str(suffix); - } + if let Some(ref suffix) = self.suffix { + value.push_str(suffix) } + + Some(value) } } #[derive(Debug, Clone, PartialEq)] -pub struct Group { - atoms: Vec, +struct Group { + fragments: Box, } -impl Fragment { 
+#[derive(Debug, Clone, PartialEq)] +enum List { + AndThen(Vec), + Cons(Vec), +} + +impl List { fn format( &self, - buf: &mut String, field: &FieldRef, options: &FormatOptions, - ) { + ) -> Option { + let mut acc = String::new(); + match self { - Self::Atom(atom) => { - if let Some(subfield) = atom - .codes - .iter() - .find_map(|code| field.find(|s| s.code() == *code)) - { - atom.format_subfield(buf, subfield, options); + Self::AndThen(fragments) => { + for f in fragments.iter() { + let Some(value) = f.format(field, options) else { + break; + }; + + acc.push_str(&value); } } - Self::Group(group) => { - field.subfields().iter().for_each(|subfield| { - group.atoms.iter().for_each(|atom| { - atom.format_subfield(buf, subfield, options); - }); - }); + Self::Cons(fragments) => { + for f in fragments.iter() { + if let Some(value) = f.format(field, options) { + acc.push_str(&value); + }; + } } } + + if !acc.is_empty() { + Some(acc) + } else { + None + } } } @@ -95,14 +165,6 @@ pub struct FormatOptions { strip_overread_char: bool, } -impl FormatOptions { - pub fn new(strip_overread_char: bool) -> Self { - Self { - strip_overread_char, - } - } -} - impl Default for FormatOptions { fn default() -> Self { Self { @@ -111,202 +173,84 @@ impl Default for FormatOptions { } } -#[derive(Error, Debug, Clone, PartialEq)] -#[error("{0} is not a valid format string")] -pub struct ParseFormatError(String); +impl FormatOptions { + pub fn new() -> Self { + Self::default() + } -impl Format { - /// Creates a new format from a string slice. - /// - /// # Example - /// - /// ```rust - /// use pica_format::Format; - /// - /// # fn main() { example().unwrap(); } - /// fn example() -> anyhow::Result<()> { - /// let _fmt = Format::new("a")?; - /// Ok(()) - /// } - /// ``` - pub fn new(fmt: T) -> Result - where - T: AsRef, - { - parse_format - .parse(fmt.as_ref()) - .map_err(|_| ParseFormatError(fmt.as_ref().into())) + /// Whether to strip the overread character '@' from a value or not. 
+ pub fn strip_overread_char(mut self, yes: bool) -> Self { + self.strip_overread_char = yes; + self } } -pub trait FormatExt { - fn format(&self, fmt: &Format, options: &FormatOptions) -> String; +impl Fragments { + fn format( + &self, + field: &FieldRef, + options: &FormatOptions, + ) -> Option { + match self { + Self::Value(value) => value.format(field, options), + Self::List(list) => list.format(field, options), + Self::Group(Group { fragments }) => { + fragments.format(field, options) + } + } + } } -impl FormatExt for FieldRef<'_> { - /// Formats a field reference according to the format string. - /// - /// # Example - /// - /// ```rust - /// use pica_format::{Format, FormatExt}; - /// use pica_record::FieldRef; - /// - /// # fn main() { example().unwrap(); } - /// fn example() -> anyhow::Result<()> { - /// let field = - /// FieldRef::from_bytes(b"041A \x1faGoethe\x1e").unwrap(); - /// let format = Format::new("a")?; - /// let options = Default::default(); - /// assert_eq!(field.format(&format, &options), "Goethe"); - /// Ok(()) - /// } - /// ``` - fn format(&self, fmt: &Format, options: &FormatOptions) -> String { - let mut buf = String::new(); - fmt.fragments().for_each(|fragment| { - fragment.format(&mut buf, self, options); - }); +pub trait FormatExt { + fn format( + &self, + format: &Format, + options: &FormatOptions, + ) -> Vec; +} - buf +impl FormatExt for ByteRecord<'_> { + fn format( + &self, + format: &Format, + options: &FormatOptions, + ) -> Vec { + self.iter() + .filter(|field| field.tag() == format.tag_matcher()) + .filter(|field| { + *format.occurrence_matcher() == field.occurrence() + }) + .filter(|field| { + if let Some(m) = format.subfield_matcher() { + m.is_match(field.subfields(), &Default::default()) + } else { + true + } + }) + .filter_map(|field| format.fragments.format(field, options)) + .collect() } } #[cfg(test)] -mod test { +mod tests { use super::*; type TestResult = anyhow::Result<()>; #[test] - fn test_format_subject_headings() -> 
TestResult { - let opts = FormatOptions::default(); - let fmt = Format::new("a (' / ' x <|> ' (' g ')')")?; - - let data = "041A \x1faPlymouth\x1fgMarke\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Plymouth (Marke)"); - - let data = "041A \x1faSchlacht um Berlin\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Schlacht um Berlin"); + fn test_parse_format() -> TestResult { + let format = + Format::new("041A{ (a <*> b) <$> (c <*> d) | a? }"); - let data = "041A \x1faDas @Gute\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Das Gute"); - - let data = - "041A \x1faBarletta\x1fxDisfida di Barletta\x1fgMotiv\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; + assert_eq!(format.tag_matcher(), &TagMatcher::new("041A")); assert_eq!( - field.format(&fmt, &opts), - "Barletta / Disfida di Barletta (Motiv)" + format.occurrence_matcher(), + &OccurrenceMatcher::None ); - - Ok(()) - } - - #[test] - fn test_format_geographic_names() -> TestResult { - let opts = FormatOptions::default(); - let fmt = Format::new("a (' (' [gz] ')' <|> ' / ' x)")?; - - let data = "065A \x1faArgolis\x1fzNord\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Argolis (Nord)"); - - let data = "065A \x1faUSA\x1fxSüdstaaten\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "USA / Südstaaten"); - - let data = "065A \x1faSanta Maria Maggiore\x1fgRom\ - \x1fxKrippenkapelle\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!( - field.format(&fmt, &opts), - "Santa Maria Maggiore (Rom) / Krippenkapelle" - ); - - Ok(()) - } - - #[test] - fn test_format_corporate_bodies() -> TestResult { - let opts = FormatOptions::default(); - let fmt = - Format::new("a (' (' g ')' <|> ' / ' [xb] <|> ', ' n)")?; - - let data = "029A \x1faThe 
@Hitmakers\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "The Hitmakers"); - - let data = "029A \x1faDeutschland\x1fgBundesrepublik\ - \x1fbAuswärtiges Amt\x1fbBibliothek\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!( - field.format(&fmt, &opts), - "Deutschland (Bundesrepublik) / Auswärtiges Amt / Bibliothek" - ); - - let data = "029A \x1faTōkai Daigaku\x1fbKōgakubu\x1fn2\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!( - field.format(&fmt, &opts), - "Tōkai Daigaku / Kōgakubu, 2" - ); - - Ok(()) - } - - #[test] - fn test_format_conferences() -> TestResult { - let opts = FormatOptions::default(); - let fmt = Format::new( - "(n ' ') a (', ' d <|> ' (' c ')' <|> ' / ' [bx])", - )?; - - let data = "030A \x1faInternationale Hofer Filmtage\ - \x1fn13.\x1fd1979\x1fcHof (Saale)\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!( - field.format(&fmt, &opts), - "13. Internationale Hofer Filmtage, 1979 (Hof (Saale))" - ); - - let data = "030A \x1faOECD\x1fb\ - Ministerial Meeting on Science of OECD Countries\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!( - field.format(&fmt, &opts), - "OECD / Ministerial Meeting on Science of OECD Countries" - ); - - Ok(()) - } - - #[test] - fn test_format_works() -> TestResult { - let opts = FormatOptions::default(); - let fmt = - Format::new("a (' (' [fg] ')' <|> ', ' n <|> '. 
' p)")?; - - let data = "022A \x1faVerfassung\x1ff2011\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Verfassung (2011)"); - - let data = "022A \x1faFaust\x1fn1\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Faust, 1"); - - let data = "022A \x1faFaust\x1fgVolksbuch\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; - assert_eq!(field.format(&fmt, &opts), "Faust (Volksbuch)"); - - let data = "022A \x1faBibel\x1fpPetrusbrief\x1fn1.-2.\x1e"; - let field = FieldRef::from_bytes(data.as_bytes())?; assert_eq!( - field.format(&fmt, &opts), - "Bibel. Petrusbrief, 1.-2." + format.subfield_matcher().unwrap(), + &SubfieldMatcher::new("a?") ); Ok(()) diff --git a/crates/pica-format/src/parse.rs b/crates/pica-format/src/parse.rs index fc5a4d72d..ae1f5af3d 100644 --- a/crates/pica-format/src/parse.rs +++ b/crates/pica-format/src/parse.rs @@ -1,58 +1,206 @@ +use std::cell::RefCell; + +use bstr::ByteSlice; +use pica_matcher::parser::{ + parse_occurrence_matcher, parse_subfield_matcher, parse_tag_matcher, +}; use winnow::ascii::{multispace0, multispace1}; use winnow::combinator::{ - alt, delimited, opt, preceded, repeat, separated, + alt, delimited, opt, preceded, repeat, separated, terminated, }; use winnow::error::{ContextError, ParserError}; use winnow::prelude::*; use winnow::stream::{AsChar, Compare, Stream, StreamIsPartial}; use winnow::token::{one_of, take_till}; -use crate::{Atom, Format, Fragment, Group}; +use crate::{Format, Fragments, Group, List, Value}; -/// Strip whitespaces from the beginning and end. 
-pub(crate) fn ws, F>( - mut inner: F, -) -> impl Parser -where - I: Stream + StreamIsPartial, - ::Token: AsChar + Clone, - F: Parser, -{ - move |i: &mut I| { - let _ = multispace0.parse_next(i)?; - let o = inner.parse_next(i); - let _ = multispace0.parse_next(i)?; - o - } +pub fn parse_format(i: &mut &[u8]) -> PResult { + ( + parse_tag_matcher, + parse_occurrence_matcher, + delimited( + ws('{'), + ( + parse_fragments, + opt(preceded(ws('|'), parse_subfield_matcher)), + ), + ws('}'), + ), + ) + .map(|(t, o, (f, s))| Format { + tag_matcher: t, + occurrence_matcher: o, + subfield_matcher: s, + fragments: f, + }) + .parse_next(i) } -#[derive(Debug, Copy, Clone)] -enum Quotes { - Single, - Double, +fn parse_fragments(i: &mut &[u8]) -> PResult { + alt(( + parse_list.map(Fragments::List), + parse_group.map(Fragments::Group), + parse_value.map(Fragments::Value), + )) + .parse_next(i) +} + +fn parse_value(i: &mut &[u8]) -> PResult { + (opt(ws(parse_string)), parse_codes, opt(ws(parse_string))) + .map(|(prefix, codes, suffix)| Value { + prefix, + codes, + suffix, + }) + .parse_next(i) +} + +thread_local! 
{ + pub static GROUP_LEVEL: RefCell = const { RefCell::new(0) }; +} + +fn increment_group_level(i: &mut &[u8]) -> PResult<()> { + GROUP_LEVEL.with(|level| { + *level.borrow_mut() += 1; + if *level.borrow() >= 32 { + Err(winnow::error::ErrMode::from_error_kind( + i, + winnow::error::ErrorKind::Many, + )) + } else { + Ok(()) + } + }) +} + +fn decrement_group_level() { + GROUP_LEVEL.with(|level| { + *level.borrow_mut() -= 1; + }) +} + +fn parse_group(i: &mut &[u8]) -> PResult { + delimited( + terminated(ws('('), increment_group_level), + parse_fragments, + ws(')').map(|_| decrement_group_level()), + ) + .map(|fragments| Group { + fragments: Box::new(fragments), + }) + .parse_next(i) +} + +fn parse_list(i: &mut &[u8]) -> PResult { + alt((parse_list_cons, parse_list_and_then)).parse_next(i) +} + +fn parse_list_cons(i: &mut &[u8]) -> PResult { + separated( + 2.., + alt(( + parse_list_and_then.map(Fragments::List), + parse_group.map(Fragments::Group), + parse_value.map(Fragments::Value), + )), + ws("<*>"), + ) + .map(List::Cons) + .parse_next(i) +} + +fn parse_list_and_then(i: &mut &[u8]) -> PResult { + separated( + 2.., + alt(( + parse_group.map(Fragments::Group), + parse_value.map(Fragments::Value), + )), + ws("<$>"), + ) + .map(List::AndThen) + .parse_next(i) } /// Parses a subfield code (a single alpha-numeric character) -fn parse_subfield_code(i: &mut &str) -> PResult { - one_of(('0'..='9', 'a'..='z', 'A'..='Z')).parse_next(i) +fn parse_code(i: &mut &[u8]) -> PResult { + one_of(('0'..='9', 'a'..='z', 'A'..='Z')) + .map(char::from) + .parse_next(i) } /// Parses a sequence of subfield codes. 
-fn parse_subfield_codes(i: &mut &str) -> PResult> { +fn parse_codes(i: &mut &[u8]) -> PResult> { alt(( - parse_subfield_code.map(|code| vec![code]), - delimited(ws('['), repeat(1.., parse_subfield_code), ws(']')), + parse_code.map(|code| vec![code]), + delimited(ws('['), repeat(2.., parse_code), ws(']')), )) .parse_next(i) } +fn parse_string(i: &mut &[u8]) -> PResult { + alt(( + parse_quoted_string::(Quotes::Single), + parse_quoted_string::(Quotes::Double), + )) + .map(|s| s.to_str().expect("valid utf8").to_string()) + .parse_next(i) +} + +#[derive(Debug, Copy, Clone)] +enum Quotes { + Single, + Double, +} + #[derive(Debug, Clone)] enum StringFragment<'a> { - Literal(&'a str), + Literal(&'a [u8]), EscapedChar(char), EscapedWs, } +fn parse_quoted_string<'a, E>( + quotes: Quotes, +) -> impl Parser<&'a [u8], Vec, E> +where + E: ParserError<&'a [u8]>, +{ + use StringFragment::*; + + let builder = repeat( + 0.., + parse_quoted_string_fragment::(quotes), + ) + .fold(Vec::new, |mut acc, fragment| { + match fragment { + Literal(s) => acc.extend_from_slice(s), + EscapedChar(c) => acc.push(c as u8), + EscapedWs => {} + } + + acc + }); + + match quotes { + Quotes::Single => delimited('\'', builder, '\''), + Quotes::Double => delimited('"', builder, '"'), + } +} + +fn parse_quoted_string_fragment<'a, E: ParserError<&'a [u8]>>( + quotes: Quotes, +) -> impl Parser<&'a [u8], StringFragment<'a>, E> { + use StringFragment::*; + + alt(( + parse_literal::<&'a [u8], E>(quotes).map(Literal), + parse_escaped_char::<&'a [u8], E>(quotes).map(EscapedChar), + preceded('\\', multispace1).value(EscapedWs), + )) +} + fn parse_literal( quotes: Quotes, ) -> impl Parser::Slice, E> @@ -93,122 +241,17 @@ where ) } -fn parse_quoted_string_fragment<'a, E: ParserError<&'a str>>( - quotes: Quotes, -) -> impl Parser<&'a str, StringFragment<'a>, E> { - use StringFragment::*; - - alt(( - parse_literal::<&'a str, E>(quotes).map(Literal), - parse_escaped_char::<&'a str, E>(quotes).map(EscapedChar), - 
preceded('\\', multispace1).value(EscapedWs), - )) -} - -fn parse_quoted_string<'a, E>( - quotes: Quotes, -) -> impl Parser<&'a str, String, E> +/// Strip whitespaces from the beginning and end. +fn ws, F>(mut inner: F) -> impl Parser where - E: ParserError<&'a str>, + I: Stream + StreamIsPartial, + ::Token: AsChar + Clone, + F: Parser, { - use StringFragment::*; - - let builder = repeat( - 0.., - parse_quoted_string_fragment::(quotes), - ) - .fold(String::new, |mut acc, fragment| { - match fragment { - Literal(s) => acc.push_str(s), - EscapedChar(c) => acc.push(c), - EscapedWs => {} - } - - acc - }); - - match quotes { - Quotes::Single => delimited('\'', builder, '\''), - Quotes::Double => delimited('"', builder, '"'), - } -} - -fn parse_single_quoted_string(i: &mut &str) -> PResult { - parse_quoted_string::(Quotes::Single).parse_next(i) -} - -fn parse_double_quoted_string(i: &mut &str) -> PResult { - parse_quoted_string::(Quotes::Double).parse_next(i) -} - -fn parse_string(i: &mut &str) -> PResult { - alt((parse_single_quoted_string, parse_double_quoted_string)) - .parse_next(i) -} - -fn parse_atom(i: &mut &str) -> PResult { - ( - opt(ws(parse_string)), - ws(parse_subfield_codes), - opt(ws(parse_string)), - ) - .map(|(prefix, codes, suffix)| Atom { - prefix, - codes, - suffix, - }) - .parse_next(i) -} - -fn parse_group(i: &mut &str) -> PResult { - delimited( - ws('('), - ws(separated(1.., parse_atom, ws("<|>")) - .map(|atoms| Group { atoms })), - ws(')'), - ) - .parse_next(i) -} - -/// Parses a format fragment. -fn parse_fragment(i: &mut &str) -> PResult { - alt(( - ws(parse_atom).map(Fragment::Atom), - ws(parse_group).map(Fragment::Group), - )) - .parse_next(i) -} - -/// Parses a format string. 
-pub(crate) fn parse_format(i: &mut &str) -> PResult { - repeat(1.., parse_fragment).map(Format).parse_next(i) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - #[cfg_attr(miri, ignore)] - fn test_parse_subfield_code() { - for c in '\0'..=char::MAX { - if c.is_ascii_alphanumeric() { - assert_eq!( - parse_subfield_codes.parse(&format!("[{c}]")), - Ok(vec![c]) - ); - assert_eq!( - parse_subfield_codes.parse(&format!("{c}")), - Ok(vec![c]) - ); - } else { - assert!(parse_subfield_codes - .parse(&format!("$[{c}]")) - .is_err()); - assert!(parse_subfield_codes - .parse(&format!("${c}")) - .is_err()); - } - } + move |i: &mut I| { + let _ = multispace0.parse_next(i)?; + let o = inner.parse_next(i); + let _ = multispace0.parse_next(i)?; + o } } diff --git a/crates/pica-format/tests/integration.rs b/crates/pica-format/tests/integration.rs new file mode 100644 index 000000000..cb8ced61b --- /dev/null +++ b/crates/pica-format/tests/integration.rs @@ -0,0 +1,40 @@ +use std::str::FromStr; +use std::sync::OnceLock; + +use pica_format::{Format, FormatExt}; +use pica_record::ByteRecord; + +fn ada_lovelace() -> &'static [u8] { + use std::path::Path; + use std::{env, fs}; + + static DATA: OnceLock> = OnceLock::new(); + DATA.get_or_init(|| { + let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); + let path = Path::new(&manifest_dir) + .join("../pica-toolkit/tests/data/119232022.dat"); + fs::read_to_string(&path).unwrap().as_bytes().to_vec() + }) +} + +#[test] +fn test_format() -> anyhow::Result<()> { + let ada = ByteRecord::from_bytes(ada_lovelace()).expect("record"); + let fmt = Format::from_str("028A{ a <$> (', ' d <*> ' ' c) }")?; + let result = ada.format(&fmt, &Default::default()); + assert_eq!(result, vec!["Lovelace, Ada King of".to_string()]); + + Ok(()) +} + +#[test] +fn test_format_predicate() -> anyhow::Result<()> { + let ada = ByteRecord::from_bytes(ada_lovelace()).expect("record"); + let fmt = Format::from_str( + "028[A@]{ a <$> (', ' d <*> ' ' c) | 4 
== 'nafr'}", + )?; + let result = ada.format(&fmt, &Default::default()); + assert_eq!(result, vec!["Byron, Ada Augusta".to_string()]); + + Ok(()) +} diff --git a/crates/pica-format/tests/main.rs b/crates/pica-format/tests/main.rs new file mode 100644 index 000000000..6d3bbe604 --- /dev/null +++ b/crates/pica-format/tests/main.rs @@ -0,0 +1 @@ +mod integration;