From 4a395e7c28e95a7e72705271a9ada439806b83b9 Mon Sep 17 00:00:00 2001 From: Nico Wagner Date: Mon, 2 Sep 2024 07:54:57 +0000 Subject: [PATCH] Wrap code in `SubfieldCode` Signed-off-by: Nico Wagner --- crates/pica-format/src/lib.rs | 4 +- crates/pica-format/src/parse.rs | 7 +- crates/pica-matcher/src/subfield_matcher.rs | 190 ++++++++++++++---- crates/pica-path/src/lib.rs | 35 +++- .../src/commands/convert/plain.rs | 2 +- pica-record/src/error.rs | 5 + pica-record/src/field.rs | 2 +- pica-record/src/lib.rs | 1 + pica-record/src/subfield.rs | 151 ++++++++++++-- 9 files changed, 320 insertions(+), 77 deletions(-) diff --git a/crates/pica-format/src/lib.rs b/crates/pica-format/src/lib.rs index 3c8c3c7ff..27a18de87 100644 --- a/crates/pica-format/src/lib.rs +++ b/crates/pica-format/src/lib.rs @@ -2,7 +2,7 @@ use std::ops::RangeTo; use std::str::FromStr; use pica_matcher::{OccurrenceMatcher, SubfieldMatcher, TagMatcher}; -use pica_record::{FieldRef, RecordRef, SubfieldRef}; +use pica_record::{FieldRef, RecordRef, SubfieldCode, SubfieldRef}; use thiserror::Error; use winnow::prelude::*; @@ -123,7 +123,7 @@ impl Formatter for Fragments { #[derive(Debug, Clone, PartialEq)] struct Value { - codes: Vec, + codes: Vec, prefix: Option, suffix: Option, bounds: RangeTo, diff --git a/crates/pica-format/src/parse.rs b/crates/pica-format/src/parse.rs index a4749a129..32fe0e788 100644 --- a/crates/pica-format/src/parse.rs +++ b/crates/pica-format/src/parse.rs @@ -5,6 +5,7 @@ use bstr::ByteSlice; use pica_matcher::parser::{ parse_occurrence_matcher, parse_subfield_matcher, parse_tag_matcher, }; +use pica_record::SubfieldCode; use winnow::ascii::{digit1, multispace0, multispace1}; use winnow::combinator::{ alt, delimited, empty, opt, preceded, repeat, separated, terminated, @@ -180,14 +181,14 @@ fn parse_list_and_then(i: &mut &[u8]) -> PResult { } /// Parses a subfield code (a single alpha-numeric character) -fn parse_code(i: &mut &[u8]) -> PResult { +fn parse_code(i: &mut &[u8]) -> PResult { one_of(('0'..='9', 'a'..='z', 'A'..='Z')) - .map(char::from) + .map(SubfieldCode::from_unchecked) .parse_next(i) } /// Parses a sequence of subfield codes. -fn parse_codes(i: &mut &[u8]) -> PResult> { +fn parse_codes(i: &mut &[u8]) -> PResult> { alt(( parse_code.map(|code| vec![code]), delimited(ws('['), repeat(2.., parse_code), ws(']')), diff --git a/crates/pica-matcher/src/subfield_matcher.rs b/crates/pica-matcher/src/subfield_matcher.rs index 838fc5e15..258540eb5 100644 --- a/crates/pica-matcher/src/subfield_matcher.rs +++ b/crates/pica-matcher/src/subfield_matcher.rs @@ -6,7 +6,7 @@ use std::str::FromStr; use bstr::ByteSlice; use pica_record::parser::parse_subfield_code; -use pica_record::SubfieldRef; +use pica_record::{SubfieldCode, SubfieldRef}; use regex::bytes::{Regex, RegexBuilder}; use strsim::normalized_levenshtein; use winnow::ascii::digit1; @@ -31,27 +31,37 @@ use crate::{MatcherOptions, ParseMatcherError}; /// is contained in the matcher's code list. #[derive(Clone, Debug, PartialEq, Eq)] pub struct ExistsMatcher { - codes: Vec, + codes: Vec, } const SUBFIELD_CODES: &str = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; #[inline] -fn parse_subfield_code_range(i: &mut &[u8]) -> PResult> { +fn parse_subfield_code_range( + i: &mut &[u8], +) -> PResult> { separated_pair(parse_subfield_code, '-', parse_subfield_code) .verify(|(min, max)| min < max) - .map(|(min, max)| (min..=max).collect()) + .map(|(min, max)| { + (min.as_byte()..=max.as_byte()) + .map(|code| SubfieldCode::from_unchecked(code)) + .collect() + }) .parse_next(i) } #[inline] -fn parse_subfield_code_single(i: &mut &[u8]) -> PResult> { +fn parse_subfield_code_single( + i: &mut &[u8], +) -> PResult> { parse_subfield_code.map(|code| vec![code]).parse_next(i) } #[inline] -fn parse_subfield_code_list(i: &mut &[u8]) -> PResult> { +fn parse_subfield_code_list( + i: &mut &[u8], +) -> PResult> { delimited( '[', repeat( @@ -71,12 +81,20 @@ fn parse_subfield_code_list(i: &mut &[u8]) -> PResult> { } #[inline] -fn parse_subfield_code_wildcard(i: &mut &[u8]) -> PResult> { - '*'.value(SUBFIELD_CODES.chars().collect()).parse_next(i) +fn parse_subfield_code_wildcard( + i: &mut &[u8], +) -> PResult> { + '*'.value( + SUBFIELD_CODES + .chars() + .map(|code| SubfieldCode::new(code).unwrap()) + .collect(), + ) + .parse_next(i) } /// Parse a list of subfield codes -fn parse_subfield_codes(i: &mut &[u8]) -> PResult> { +fn parse_subfield_codes(i: &mut &[u8]) -> PResult> { alt(( parse_subfield_code_list, parse_subfield_code_single, @@ -119,9 +137,11 @@ impl ExistsMatcher { /// } /// ``` pub fn new>>(codes: T) -> Self { - let codes = codes.into(); - - assert!(codes.iter().all(char::is_ascii_alphanumeric)); + let codes = codes + .into() + .into_iter() + .map(|code| SubfieldCode::new(code).unwrap()) + .collect(); Self { codes } } @@ -190,7 +210,7 @@ impl FromStr for ExistsMatcher { #[derive(Clone, Debug, PartialEq, Eq)] pub struct RelationMatcher { quantifier: Quantifier, - codes: Vec, + codes: Vec, op: RelationalOp, value: Vec, } @@ -392,7 +412,7 @@ impl FromStr for RelationMatcher { #[derive(PartialEq, Clone, Debug)] pub struct RegexMatcher { quantifier: Quantifier, - codes: Vec, + codes: Vec, re: String, invert: bool, } @@ -434,8 +454,11 @@ impl RegexMatcher { S: Into, T: Into>, { - let codes = codes.into(); - assert!(codes.iter().all(char::is_ascii_alphanumeric)); + let codes = codes + .into() + .into_iter() + .map(|code| SubfieldCode::new(code).unwrap()) + .collect(); let re = re.into(); assert!(RegexBuilder::new(&re).build().is_ok()); @@ -524,7 +547,7 @@ impl FromStr for RegexMatcher { #[derive(Clone, Debug, PartialEq, Eq)] pub struct InMatcher { quantifier: Quantifier, - codes: Vec, + codes: Vec, values: Vec>, invert: bool, } @@ -584,7 +607,10 @@ impl InMatcher { .map(|s| s.as_ref().to_vec()) .collect::>(); - assert!(codes.iter().all(char::is_ascii_alphanumeric)); + let codes = codes + .into_iter() + .map(|code| SubfieldCode::new(code).unwrap()) + .collect(); Self { quantifier, @@ -675,7 +701,7 @@ impl FromStr for InMatcher { /// A matcher that checks the number of occurrences of a subfield. #[derive(Clone, Debug, PartialEq, Eq)] pub struct CardinalityMatcher { - code: char, + code: SubfieldCode, op: RelationalOp, value: usize, } @@ -726,7 +752,11 @@ impl CardinalityMatcher { assert!(code.is_ascii_alphanumeric()); assert!(op.is_usize_applicable()); - Self { code, op, value } + Self { + code: SubfieldCode::new(code).unwrap(), + op, + value, + } } /// Returns true of number of fields with a code equal to the @@ -739,7 +769,7 @@ impl CardinalityMatcher { ) -> bool { let count = subfields .into_iter() - .filter(|&s| self.code == s.code()) + .filter(|&s| self.code == *s.code()) .count(); match self.op { @@ -1174,6 +1204,8 @@ impl BitXor for SubfieldMatcher { mod tests { use super::*; + type TestResult = anyhow::Result<()>; + #[test] fn parse_subfield_codes() { let codes = SUBFIELD_CODES.chars().collect::>(); @@ -1206,7 +1238,7 @@ mod tests { } #[test] - fn parse_exists_matcher() { + fn parse_exists_matcher() -> TestResult { macro_rules! parse_success { ($input:expr, $codes:expr) => { assert_eq!( @@ -1216,17 +1248,46 @@ mod tests { }; } - parse_success!(b"*?", SUBFIELD_CODES.chars().collect()); - parse_success!(b"[a-f]?", vec!['a', 'b', 'c', 'd', 'e', 'f']); - parse_success!(b"[a-cf]?", vec!['a', 'b', 'c', 'f']); - parse_success!(b"[ab]?", vec!['a', 'b']); - parse_success!(b"a?", vec!['a']); + parse_success!( + b"*?", + SUBFIELD_CODES + .chars() + .map(|code| SubfieldCode::new(code).unwrap()) + .collect() + ); + parse_success!( + b"[a-f]?", + vec![ + 'a'.try_into()?, + 'b'.try_into()?, + 'c'.try_into()?, + 'd'.try_into()?, + 'e'.try_into()?, + 'f'.try_into()? + ] + ); + parse_success!( + b"[a-cf]?", + vec![ + 'a'.try_into()?, + 'b'.try_into()?, + 'c'.try_into()?, + 'f'.try_into()? + ] + ); + parse_success!( + b"[ab]?", + vec!['a'.try_into()?, 'b'.try_into()?] + ); + parse_success!(b"a?", vec!['a'.try_into()?]); assert!(super::parse_exists_matcher.parse(b"a ?").is_err()); + + Ok(()) } #[test] - fn parse_relation_matcher() { + fn parse_relation_matcher() -> TestResult { use Quantifier::*; use RelationalOp::*; @@ -1246,41 +1307,73 @@ mod tests { }; } - parse_success!(b"0 == 'abc'", Any, vec!['0'], Eq, b"abc"); - parse_success!(b"0 != 'abc'", Any, vec!['0'], Ne, b"abc"); + parse_success!( + b"0 == 'abc'", + Any, + vec!['0'.try_into()?], + Eq, + b"abc" + ); + parse_success!( + b"0 != 'abc'", + Any, + vec!['0'.try_into()?], + Ne, + b"abc" + ); parse_success!( b"0 =^ 'abc'", Any, - vec!['0'], + vec!['0'.try_into()?], StartsWith, b"abc" ); parse_success!( b"0 !^ 'abc'", Any, - vec!['0'], + vec!['0'.try_into()?], StartsNotWith, b"abc" ); - parse_success!(b"0 =$ 'abc'", Any, vec!['0'], EndsWith, b"abc"); + parse_success!( + b"0 =$ 'abc'", + Any, + vec!['0'.try_into()?], + EndsWith, + b"abc" + ); parse_success!( b"0 !$ 'abc'", Any, - vec!['0'], + vec!['0'.try_into()?], EndsNotWith, b"abc" ); - parse_success!(b"0 =* 'abc'", Any, vec!['0'], Similar, b"abc"); - parse_success!(b"0 =? 'abc'", Any, vec!['0'], Contains, b"abc"); + parse_success!( + b"0 =* 'abc'", + Any, + vec!['0'.try_into()?], + Similar, + b"abc" + ); + parse_success!( + b"0 =? 'abc'", + Any, + vec!['0'.try_into()?], + Contains, + b"abc" + ); assert!(parse_relation_matcher.parse(b"0 >= 'abc'").is_err()); assert!(parse_relation_matcher.parse(b"0 > 'abc'").is_err()); assert!(parse_relation_matcher.parse(b"0 <= 'abc'").is_err()); assert!(parse_relation_matcher.parse(b"0 < 'abc'").is_err()); + + Ok(()) } #[test] - fn parse_regex_matcher() { + fn parse_regex_matcher() -> TestResult { use super::parse_regex_matcher; macro_rules! parse_success { @@ -1297,11 +1390,28 @@ mod tests { }; } - parse_success!(b"0 =~ '^Tp'", vec!['0'], "^Tp", false); - parse_success!(b"0 !~ '^Tp'", vec!['0'], "^Tp", true); - parse_success!(b"[ab] =~ 'foo'", vec!['a', 'b'], "foo", false); + parse_success!( + b"0 =~ '^Tp'", + vec!['0'.try_into()?], + "^Tp", + false + ); + parse_success!( + b"0 !~ '^Tp'", + vec!['0'.try_into()?], + "^Tp", + true + ); + parse_success!( + b"[ab] =~ 'foo'", + vec!['a'.try_into()?, 'b'.try_into()?], + "foo", + false + ); assert!(parse_regex_matcher.parse(b"0 =~ '[[ab]'").is_err()); assert!(parse_regex_matcher.parse(b"0 !~ '[[ab]'").is_err()); + + Ok(()) } } diff --git a/crates/pica-path/src/lib.rs b/crates/pica-path/src/lib.rs index 851e0879d..7b996fc18 100644 --- a/crates/pica-path/src/lib.rs +++ b/crates/pica-path/src/lib.rs @@ -9,7 +9,7 @@ use pica_matcher::{ MatcherOptions, OccurrenceMatcher, SubfieldMatcher, TagMatcher, }; use pica_record::parser::parse_subfield_code; -use pica_record::{FieldRef, RecordRef}; +use pica_record::{FieldRef, RecordRef, SubfieldCode}; #[cfg(feature = "serde")] use serde::Deserialize; use thiserror::Error; @@ -33,7 +33,7 @@ pub struct Path { tag_matcher: TagMatcher, occurrence_matcher: OccurrenceMatcher, subfield_matcher: Option, - codes: Vec>, + codes: Vec>, } impl Path { @@ -72,7 +72,7 @@ impl Path { /// Ok(()) /// } /// ``` - pub fn codes(&self) -> &Vec> { + pub fn codes(&self) -> &Vec> { &self.codes } @@ -90,7 +90,7 @@ impl Path { /// Ok(()) /// } /// ``` - pub fn codes_flat(&self) -> Vec { + pub fn codes_flat(&self) -> Vec { self.codes.clone().into_iter().flatten().collect() } @@ -148,20 +148,30 @@ where } #[inline] -fn parse_subfield_code_range(i: &mut &[u8]) -> PResult> { +fn parse_subfield_code_range( + i: &mut &[u8], +) -> PResult> { separated_pair(parse_subfield_code, '-', parse_subfield_code) .verify(|(min, max)| min < max) - .map(|(min, max)| (min..=max).collect()) + .map(|(min, max)| { + (min.as_byte()..=max.as_byte()) + .map(|code| SubfieldCode::from_unchecked(code)) + .collect() + }) .parse_next(i) } #[inline] -fn parse_subfield_code_single(i: &mut &[u8]) -> PResult> { +fn parse_subfield_code_single( + i: &mut &[u8], +) -> PResult> { parse_subfield_code.map(|code| vec![code]).parse_next(i) } #[inline] -fn parse_subfield_code_list(i: &mut &[u8]) -> PResult> { +fn parse_subfield_code_list( + i: &mut &[u8], +) -> PResult> { delimited( '[', repeat( @@ -181,11 +191,16 @@ fn parse_subfield_code_list(i: &mut &[u8]) -> PResult> { } #[inline] -fn parse_subfield_codes(i: &mut &[u8]) -> PResult> { +fn parse_subfield_codes(i: &mut &[u8]) -> PResult> { alt(( parse_subfield_code_list, parse_subfield_code_single, - '*'.value(SUBFIELD_CODES.chars().collect()), + '*'.value( + SUBFIELD_CODES + .chars() + .map(|code| SubfieldCode::new(code).unwrap()) + .collect(), + ), )) .parse_next(i) } diff --git a/crates/pica-toolkit/src/commands/convert/plain.rs b/crates/pica-toolkit/src/commands/convert/plain.rs index b131f8642..c13981ca1 100644 --- a/crates/pica-toolkit/src/commands/convert/plain.rs +++ b/crates/pica-toolkit/src/commands/convert/plain.rs @@ -38,7 +38,7 @@ impl ByteRecordWrite for PlainWriter { for subfield in field.subfields() { self.writer - .write_all(&[b'$', subfield.code() as u8])?; + .write_all(&[b'$', subfield.code().as_byte()])?; self.writer.write_all( &subfield.value().replace(b"$", b"$$"), )?; diff --git a/pica-record/src/error.rs b/pica-record/src/error.rs index fd1235a1f..c3ffda5a1 100644 --- a/pica-record/src/error.rs +++ b/pica-record/src/error.rs @@ -1,6 +1,11 @@ use thiserror::Error; /// An error that can occur when ceparsing PICA+ records. +#[derive(Error, PartialEq, Eq, Debug)] +pub enum PicaError { + #[error("'{0}' is not a valid subfield code.")] + InvalidSubfieldCode(char), +} /// -----{ TODO }----------------------------------------- /// An error that can occur when parsing PICA+ records. diff --git a/pica-record/src/field.rs b/pica-record/src/field.rs index 6b9ffb51d..5ba77f446 100644 --- a/pica-record/src/field.rs +++ b/pica-record/src/field.rs @@ -173,7 +173,7 @@ impl<'a> FieldRef<'a> { pub fn contains(&self, code: char) -> bool { self.subfields .iter() - .any(|subfield| subfield.code() == code) + .any(|subfield| *subfield.code() == code) } /// Searches for the first subfield that satisfies the given diff --git a/pica-record/src/lib.rs b/pica-record/src/lib.rs index 01c9a2591..881869eb6 100644 --- a/pica-record/src/lib.rs +++ b/pica-record/src/lib.rs @@ -23,6 +23,7 @@ //! parser combinators, which are used in upstream crates (matcher, //! select). +pub use error::PicaError; pub use subfield::SubfieldCode; /// -----{ TODO }----------------------------------------- diff --git a/pica-record/src/subfield.rs b/pica-record/src/subfield.rs index e8c71dfef..f43035aae 100644 --- a/pica-record/src/subfield.rs +++ b/pica-record/src/subfield.rs @@ -1,9 +1,127 @@ //! This module contains types and functions to work with PICA+ //! subfields. -#[derive(Debug, Clone, PartialEq)] +use std::fmt::{self, Display}; + +use crate::PicaError; + +/// A PICA+ subfield code. +/// +/// This type behaves like `char` but guarantees that the subfield code +/// is an ASCII alpha-numeric character. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub struct SubfieldCode(char); +impl SubfieldCode { + /// Creates a new subfield code. + /// + /// # Error + /// + /// This functions fails if the given code is not an ASCII + /// alpha-numeric character. + /// + /// # Example + /// + /// ```rust + /// use pica_record::SubfieldCode; + /// + /// let code = SubfieldCode::new('a')?; + /// assert_eq!(code, 'a'); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn new(code: char) -> Result { + if !code.is_ascii_alphanumeric() { + return Err(PicaError::InvalidSubfieldCode(code)); + } + + Ok(Self(code)) + } + + /// Creates a subfied code without checking for validity. + /// + /// # Safety + /// + /// The caller *must* ensure that the given subfield code is valid. + /// + /// # Example + /// + /// ```rust + /// use pica_record::SubfieldCode; + /// + /// let code = SubfieldCode::from_unchecked('a'); + /// assert_eq!(code, 'a'); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn from_unchecked>(code: T) -> Self { + Self(code.into()) + } + + /// Returns the subfield code as a byte (`u8`). + /// + /// # Example + /// + /// ```rust + /// use pica_record::SubfieldCode; + /// + /// let code = SubfieldCode::new('a')?; + /// assert_eq!(code.as_byte(), b'a'); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn as_byte(&self) -> u8 { + self.0 as u8 + } +} + +impl Display for SubfieldCode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +impl PartialEq for SubfieldCode { + fn eq(&self, code: &char) -> bool { + self.0 == *code + } +} + +impl PartialEq for &SubfieldCode { + fn eq(&self, code: &char) -> bool { + self.0 == *code + } +} + +impl TryFrom for SubfieldCode { + type Error = PicaError; + + fn try_from(code: char) -> Result { + Self::new(code) + } +} + +#[cfg(feature = "arbitrary")] +impl quickcheck::Arbitrary for SubfieldCode { + fn arbitrary(g: &mut quickcheck::Gen) -> Self { + let code = (1..) + .map(|_| char::arbitrary(g)) + .find(char::is_ascii_alphanumeric) + .unwrap(); + + Self(code) + } +} + +/// Parse a PICA+ subfield code. +pub fn parse_subfield_code(i: &mut &[u8]) -> PResult { + one_of((b'0'..=b'9', b'a'..=b'z', b'A'..=b'Z')) + .map(SubfieldCode::from_unchecked) + .parse_next(i) +} + /// -----{ TODO }----------------------------------------- use std::io::{self, Write}; use std::iter; @@ -19,24 +137,17 @@ use crate::error::ParsePicaError; /// An immutable PICA+ subfield. #[derive(Debug, Clone, PartialEq, Eq)] pub struct SubfieldRef<'a> { - code: char, + code: SubfieldCode, value: &'a BStr, } /// A mutable PICA+ subfield. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Subfield { - code: char, + code: SubfieldCode, value: BString, } -/// Parse a PICA+ subfield code. -pub fn parse_subfield_code(i: &mut &[u8]) -> PResult { - one_of((b'0'..=b'9', b'a'..=b'z', b'A'..=b'Z')) - .map(char::from) - .parse_next(i) -} - /// Parse a PICA+ subfield value. #[inline] fn parse_subfield_value<'a>(i: &mut &'a [u8]) -> PResult<&'a BStr> { @@ -124,8 +235,8 @@ impl<'a> SubfieldRef<'a> { /// Ok(()) /// } /// ``` - pub fn code(&self) -> char { - self.code + pub fn code(&self) -> &SubfieldCode { + &self.code } /// Returns the value of the subfield. @@ -270,7 +381,10 @@ where return Err(ParsePicaError::InvalidSubfield); } - Ok(Self { code, value }) + Ok(Self { + code: SubfieldCode(code), + value, + }) } } @@ -332,16 +446,13 @@ impl From> for Subfield { #[cfg(feature = "arbitrary")] impl quickcheck::Arbitrary for Subfield { fn arbitrary(g: &mut quickcheck::Gen) -> Self { - let code = (1..) - .map(|_| char::arbitrary(g)) - .filter(char::is_ascii_alphanumeric) - .nth(0) - .unwrap(); - let value = String::arbitrary(g).replace(['\x1f', '\x1e'], "").into(); - Self { code, value } + Self { + code: SubfieldCode::arbitrary(g), + value, + } } }