From ab5333c60dad0fce905963e008a9c55660184cbd Mon Sep 17 00:00:00 2001
From: raskad <32105367+raskad@users.noreply.github.com>
Date: Sun, 7 Jan 2024 21:40:04 +0100
Subject: [PATCH] Add parsing for `DerivedNormalizationProps.txt`

---
 .../src/derived_normalization_properties.rs   | 64 +++++++++++++++++++
 ucd-parse/src/lib.rs                          |  2 +
 2 files changed, 66 insertions(+)
 create mode 100644 ucd-parse/src/derived_normalization_properties.rs

diff --git a/ucd-parse/src/derived_normalization_properties.rs b/ucd-parse/src/derived_normalization_properties.rs
new file mode 100644
index 0000000..dd7f352
--- /dev/null
+++ b/ucd-parse/src/derived_normalization_properties.rs
@@ -0,0 +1,64 @@
+use std::path::Path;
+
+use crate::{
+    common::{
+        parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+        UcdFileByCodepoint,
+    },
+    error::Error,
+};
+
+/// A single row in the `DerivedNormalizationProps.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct DerivedNormalizationProperty {
+    /// The codepoint or codepoint range for this entry.
+    pub codepoints: Codepoints,
+    /// The property name assigned to the codepoints in this entry.
+    pub property: String,
+}
+
+impl UcdFile for DerivedNormalizationProperty {
+    fn relative_file_path() -> &'static Path {
+        Path::new("DerivedNormalizationProps.txt")
+    }
+}
+
+impl UcdFileByCodepoint for DerivedNormalizationProperty {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoints.into_iter()
+    }
+}
+
+impl std::str::FromStr for DerivedNormalizationProperty {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<DerivedNormalizationProperty, Error> {
+        let (codepoints, property) = parse_codepoint_association(line)?;
+        Ok(DerivedNormalizationProperty {
+            codepoints,
+            property: property.to_string(),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::DerivedNormalizationProperty;
+
+    #[test]
+    fn parse_single() {
+        let line =
+            "00A0 ; Changes_When_NFKC_Casefolded # Zs NO-BREAK SPACE\n";
+        let row: DerivedNormalizationProperty = line.parse().unwrap();
+        assert_eq!(row.codepoints, 0xA0);
+        assert_eq!(row.property, "Changes_When_NFKC_Casefolded");
+    }
+
+    #[test]
+    fn parse_range() {
+        let line = "0041..005A ; Changes_When_NFKC_Casefolded # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z\n";
+        let row: DerivedNormalizationProperty = line.parse().unwrap();
+        assert_eq!(row.codepoints, (0x41, 0x5A));
+        assert_eq!(row.property, "Changes_When_NFKC_Casefolded");
+    }
+}
diff --git a/ucd-parse/src/lib.rs b/ucd-parse/src/lib.rs
index 61a33b1..fbd301d 100644
--- a/ucd-parse/src/lib.rs
+++ b/ucd-parse/src/lib.rs
@@ -19,6 +19,7 @@ pub use crate::{
     bidi_mirroring_glyph::BidiMirroring,
     case_folding::{CaseFold, CaseStatus},
     core_properties::CoreProperty,
+    derived_normalization_properties::DerivedNormalizationProperty,
     east_asian_width::EastAsianWidth,
     emoji_properties::EmojiProperty,
     grapheme_cluster_break::{GraphemeClusterBreak, GraphemeClusterBreakTest},
@@ -65,6 +66,7 @@ mod arabic_shaping;
 mod bidi_mirroring_glyph;
 mod case_folding;
 mod core_properties;
+mod derived_normalization_properties;
 mod east_asian_width;
 mod emoji_properties;
 mod grapheme_cluster_break;