From c78fc2aaffb19c2165bef319f95dbc88a804719a Mon Sep 17 00:00:00 2001 From: Nico Wagner Date: Thu, 6 Jul 2023 11:33:05 +0200 Subject: [PATCH] Add `--squash` and `--merge` option (#642) --- CHANGELOG.md | 1 + pica-select/src/lib.rs | 133 ++++++++++++++++-- src/bin/pica/commands/select.rs | 29 +++- tests/snapshot/select/042-select-squash.toml | 6 + .../select/043-select-squash-sep.toml | 6 + .../044-select-squash-sep-empty-string.toml | 6 + .../select/045-select-squash-sep-colon.toml | 6 + .../select/046-select-squash-warning.toml | 6 + .../select/047-select-squash-merge.toml | 6 + 9 files changed, 186 insertions(+), 13 deletions(-) create mode 100644 tests/snapshot/select/042-select-squash.toml create mode 100644 tests/snapshot/select/043-select-squash-sep.toml create mode 100644 tests/snapshot/select/044-select-squash-sep-empty-string.toml create mode 100644 tests/snapshot/select/045-select-squash-sep-colon.toml create mode 100644 tests/snapshot/select/046-select-squash-warning.toml create mode 100644 tests/snapshot/select/047-select-squash-merge.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index 18a9fb84d..aee25389d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * #637 Stabilize `print` command * #641 Stabilize `sample` command +* #642 Add `--squash` and `--merge` option ### Removed diff --git a/pica-select/src/lib.rs b/pica-select/src/lib.rs index 9e4c44010..35b12d49c 100644 --- a/pica-select/src/lib.rs +++ b/pica-select/src/lib.rs @@ -246,6 +246,41 @@ impl Outcome { Self(vec![repeat("".to_string()).take(n).collect()]) } + pub fn squash(self, sep: &str) -> Self { + let flattened = + self.0.into_iter().flatten().collect::>(); + + if flattened.len() > 1 + && !sep.is_empty() + && flattened.iter().any(|item| item.contains(sep)) + { + eprintln!( + "WARNING: A subfield value contains \ + squash separator '{}'.", + sep + ); + } + + Self(vec![vec![flattened.join(sep)]]) + } + + pub fn merge(self, sep: &str) -> Self { + let result = self.0.clone().into_iter().reduce(|acc, e| { + let mut result = Vec::new(); + + for i in 0..acc.len() { + let mut value = String::from(&acc[i]); + value.push_str(sep); + value.push_str(&e[i]); + result.push(value) + } + + result + }); + + Self(vec![result.unwrap()]) + } + pub fn into_inner(self) -> Vec> { self.0 } @@ -303,9 +338,75 @@ impl Mul for Outcome { } } +/// Options and flags which can be used to configure a matcher. +#[derive(Debug)] +pub struct QueryOptions { + pub case_ignore: bool, + pub strsim_threshold: f64, + pub separator: String, + pub squash: bool, + pub merge: bool, +} + +impl Default for QueryOptions { + fn default() -> Self { + Self { + case_ignore: false, + strsim_threshold: 0.8, + separator: "|".into(), + squash: false, + merge: false, + } + } +} + +impl QueryOptions { + /// Create new matcher flags. + pub fn new() -> Self { + Self::default() + } + + /// Whether to ignore case when comparing strings or not. + pub fn case_ignore(mut self, yes: bool) -> Self { + self.case_ignore = yes; + self + } + + /// Set the similarity threshold for the similar operator (`=*`). + pub fn strsim_threshold(mut self, threshold: f64) -> Self { + self.strsim_threshold = threshold; + self + } + + /// Whether to squash subfield values or not. + pub fn squash(mut self, yes: bool) -> Self { + self.squash = yes; + self + } + + /// Whether to merge repeated fields or not. + pub fn merge(mut self, yes: bool) -> Self { + self.merge = yes; + self + } + + /// Set the squash or merge separator. + pub fn separator>(mut self, sep: S) -> Self { + self.separator = sep.into(); + self + } +} + +impl From<&QueryOptions> for MatcherOptions { + fn from(options: &QueryOptions) -> Self { + Self::new() + .strsim_threshold(options.strsim_threshold) + .case_ignore(options.case_ignore) + } +} + pub trait QueryExt { - fn query(&self, query: &Query, options: &MatcherOptions) - -> Outcome; + fn query(&self, query: &Query, options: &QueryOptions) -> Outcome; } impl + Debug + Display> QueryExt for Record { @@ -339,11 +440,7 @@ impl + Debug + Display> QueryExt for Record { /// Ok(()) /// } /// ``` - fn query( - &self, - query: &Query, - options: &MatcherOptions, - ) -> Outcome { + fn query(&self, query: &Query, options: &QueryOptions) -> Outcome { let mut outcomes = vec![]; for fragment in query.iter() { @@ -361,7 +458,10 @@ impl + Debug + Display> QueryExt for Record { }) .filter(|field| { if let Some(m) = path.subfield_matcher() { - m.is_match(field.subfields(), options) + m.is_match( + field.subfields(), + &options.into(), + ) } else { true } @@ -388,6 +488,14 @@ impl + Debug + Display> QueryExt for Record { Outcome::one() } }) + .map(|outcome| { + if options.squash { + outcome + .squash(&options.separator) + } else { + outcome + } + }) .fold(Outcome::default(), |acc, e| { acc * e }) @@ -407,6 +515,13 @@ impl + Debug + Display> QueryExt for Record { outcomes .into_iter() + .map(|outcome| { + if options.merge { + outcome.merge(&options.separator) + } else { + outcome + } + }) .reduce(|acc, e| acc * e) .unwrap_or_default() } @@ -521,7 +636,7 @@ mod tests { #[test] fn test_query() -> anyhow::Result<()> { - let options = MatcherOptions::default(); + let options = QueryOptions::default(); let record = RecordRef::new(vec![("012A", None, vec![('a', "1")])]); diff --git a/src/bin/pica/commands/select.rs b/src/bin/pica/commands/select.rs index 6e82ffe36..ea149a6ab 100644 --- a/src/bin/pica/commands/select.rs +++ b/src/bin/pica/commands/select.rs @@ -10,7 +10,7 @@ use std::str::FromStr; use clap::Parser; use pica_matcher::{MatcherOptions, RecordMatcher}; use pica_record::io::{ReaderBuilder, RecordsIterator}; -use pica_select::{Query, QueryExt}; +use pica_select::{Query, QueryExt, QueryOptions}; use serde::{Deserialize, Serialize}; use crate::common::FilterList; @@ -31,6 +31,21 @@ pub(crate) struct Select { #[arg(short, long)] skip_invalid: bool, + /// Whether to squash all values of a repeated subfield into a + /// single value or not. The separator can be specified by the + /// `--separator` option. + #[arg(long)] + squash: bool, + + #[arg(long)] + merge: bool, + + /// Sets the separator used for squashing of repeated subfield + /// values into a single value. Note that it's possible to use the + /// empty string as a separator. + #[arg(long, default_value = "|")] + separator: String, + /// Disallow empty columns #[arg(long)] no_empty_columns: bool, @@ -157,8 +172,11 @@ impl Select { None }; - let options = - MatcherOptions::default().case_ignore(self.ignore_case); + let options = QueryOptions::default() + .case_ignore(self.ignore_case) + .separator(self.separator) + .squash(self.squash) + .merge(self.merge); let matcher = if let Some(matcher_str) = self.filter { let mut matcher = RecordMatcher::new(&translit_maybe2( @@ -251,7 +269,10 @@ impl Select { } if let Some(ref matcher) = matcher { - if !matcher.is_match(&record, &options) { + if !matcher.is_match( + &record, + &MatcherOptions::from(&options), + ) { continue; } } diff --git a/tests/snapshot/select/042-select-squash.toml b/tests/snapshot/select/042-select-squash.toml new file mode 100644 index 000000000..49afe2af7 --- /dev/null +++ b/tests/snapshot/select/042-select-squash.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "select -s \"003@.0,012A{a, b}\" --squash" +status = "success" +stdin = "003@ \u001f0123456789X\u001e012A \u001fa123\u001fa456\u001fbabc\u001fbdef\u001fbhij\u001e\n" +stdout = "123456789X,123|456,abc|def|hij\n" +stderr = "" diff --git a/tests/snapshot/select/043-select-squash-sep.toml b/tests/snapshot/select/043-select-squash-sep.toml new file mode 100644 index 000000000..097eeb5cb --- /dev/null +++ b/tests/snapshot/select/043-select-squash-sep.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "select -s \"003@.0,010@.a\" --squash --separator '+++'" +status = "success" +stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n" +stdout = "123456789X,ger+++eng\n123456789X,mul\n" +stderr = "" diff --git a/tests/snapshot/select/044-select-squash-sep-empty-string.toml b/tests/snapshot/select/044-select-squash-sep-empty-string.toml new file mode 100644 index 000000000..8ba1490f4 --- /dev/null +++ b/tests/snapshot/select/044-select-squash-sep-empty-string.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "select -s \"003@.0,010@.a\" --squash --separator ''" +status = "success" +stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n" +stdout = "123456789X,gereng\n123456789X,mul\n" +stderr = "" diff --git a/tests/snapshot/select/045-select-squash-sep-colon.toml b/tests/snapshot/select/045-select-squash-sep-colon.toml new file mode 100644 index 000000000..3dfc17cb6 --- /dev/null +++ b/tests/snapshot/select/045-select-squash-sep-colon.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "select -s \"003@.0,010@.a\" --squash --separator ','" +status = "success" +stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n" +stdout = "123456789X,\"ger,eng\"\n123456789X,mul\n" +stderr = "" diff --git a/tests/snapshot/select/046-select-squash-warning.toml b/tests/snapshot/select/046-select-squash-warning.toml new file mode 100644 index 000000000..d3b028a7b --- /dev/null +++ b/tests/snapshot/select/046-select-squash-warning.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "select -s \"003@.0,010@.a\" --squash --separator 'X'" +status = "success" +stderr = "WARNING: A subfield value contains squash separator 'X'.\n" +stdin = "003@ \u001f0123456789X\u001e010@ \u001faaXb\u001faeng\u001e010@ \u001famul\u001e\n" +stdout = "123456789X,aXbXeng\n123456789X,mul\n" diff --git a/tests/snapshot/select/047-select-squash-merge.toml b/tests/snapshot/select/047-select-squash-merge.toml new file mode 100644 index 000000000..2cac11cac --- /dev/null +++ b/tests/snapshot/select/047-select-squash-merge.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "select -s \"003@.0,010@.a\" --squash --merge" +status = "success" +stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n" +stdout = "123456789X,ger|eng|mul\n" +stderr = ""