Skip to content

Commit

Permalink
Add --squash and --merge option (#642)
Browse files Browse the repository at this point in the history
  • Loading branch information
nwagner84 authored Jul 6, 2023
1 parent ccfb6d6 commit c78fc2a
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

* #637 Stabilize `print` command
* #641 Stabilize `sample` command
* #642 Add `--squash` and `--merge` option

### Removed

Expand Down
133 changes: 124 additions & 9 deletions pica-select/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,41 @@ impl Outcome {
Self(vec![repeat("".to_string()).take(n).collect()])
}

/// Collapses every value of the outcome into one single value.
///
/// All rows are flattened in order and joined with `sep`, producing
/// an outcome that consists of exactly one row with one column.
///
/// A warning is written to standard error if more than one value is
/// joined, `sep` is not empty and at least one of the values already
/// contains `sep`, because the joined values can then no longer be
/// told apart.
pub fn squash(self, sep: &str) -> Self {
    let values: Vec<String> = self.0.into_iter().flatten().collect();

    // Only a non-empty separator between at least two values can
    // collide with the content of a value.
    let ambiguous = values.len() > 1
        && !sep.is_empty()
        && values.iter().any(|value| value.contains(sep));

    if ambiguous {
        eprintln!(
            "WARNING: A subfield value contains \
             squash separator '{}'.",
            sep
        );
    }

    let squashed = values.join(sep);
    Self(vec![vec![squashed]])
}

/// Merges all rows of the outcome column-wise into a single row.
///
/// The first row is used as the starting point; the values of every
/// following row are appended to the value in the same column,
/// separated by `sep`. The resulting outcome consists of exactly one
/// row that has as many columns as the first row.
///
/// An outcome without any rows is returned unchanged instead of
/// panicking. A column for which a later row provides no value is
/// left as it is for that row.
pub fn merge(self, sep: &str) -> Self {
    // `self` is owned, so the rows can be consumed directly without
    // cloning the whole table first.
    let mut rows = self.0.into_iter();

    let mut merged = match rows.next() {
        Some(first) => first,
        // Nothing to merge: keep the outcome empty.
        None => return Self(Vec::new()),
    };

    for row in rows {
        // Append in place; `zip` pairs the columns without indexing,
        // so a short row cannot trigger an out-of-bounds panic.
        for (value, other) in merged.iter_mut().zip(row.iter()) {
            value.push_str(sep);
            value.push_str(other);
        }
    }

    Self(vec![merged])
}

pub fn into_inner(self) -> Vec<Vec<String>> {
self.0
}
Expand Down Expand Up @@ -303,9 +338,75 @@ impl Mul for Outcome {
}
}

/// Options and flags which can be used to configure a query.
#[derive(Debug)]
pub struct QueryOptions {
    /// Whether to ignore case when comparing strings.
    pub case_ignore: bool,
    /// The similarity threshold for the similar operator (`=*`).
    pub strsim_threshold: f64,
    /// The separator used when squashing or merging values.
    pub separator: String,
    /// Whether to squash subfield values.
    pub squash: bool,
    /// Whether to merge repeated fields.
    pub merge: bool,
}

impl Default for QueryOptions {
    /// Case-sensitive comparison, a similarity threshold of `0.8`, the
    /// separator `|` and neither squashing nor merging.
    fn default() -> Self {
        Self {
            case_ignore: false,
            strsim_threshold: 0.8,
            separator: String::from("|"),
            squash: false,
            merge: false,
        }
    }
}

impl QueryOptions {
    /// Creates query options with the default settings.
    pub fn new() -> Self {
        Default::default()
    }

    /// Sets whether case is ignored when comparing strings.
    pub fn case_ignore(self, yes: bool) -> Self {
        Self {
            case_ignore: yes,
            ..self
        }
    }

    /// Sets the similarity threshold for the similar operator (`=*`).
    pub fn strsim_threshold(self, threshold: f64) -> Self {
        Self {
            strsim_threshold: threshold,
            ..self
        }
    }

    /// Sets whether subfield values are squashed.
    pub fn squash(self, yes: bool) -> Self {
        Self {
            squash: yes,
            ..self
        }
    }

    /// Sets whether repeated fields are merged.
    pub fn merge(self, yes: bool) -> Self {
        Self { merge: yes, ..self }
    }

    /// Sets the separator used for squashing or merging.
    pub fn separator<S>(self, sep: S) -> Self
    where
        S: Into<String>,
    {
        Self {
            separator: sep.into(),
            ..self
        }
    }
}

impl From<&QueryOptions> for MatcherOptions {
fn from(options: &QueryOptions) -> Self {
Self::new()
.strsim_threshold(options.strsim_threshold)
.case_ignore(options.case_ignore)
}
}

pub trait QueryExt {
fn query(&self, query: &Query, options: &MatcherOptions)
-> Outcome;
fn query(&self, query: &Query, options: &QueryOptions) -> Outcome;
}

impl<T: AsRef<[u8]> + Debug + Display> QueryExt for Record<T> {
Expand Down Expand Up @@ -339,11 +440,7 @@ impl<T: AsRef<[u8]> + Debug + Display> QueryExt for Record<T> {
/// Ok(())
/// }
/// ```
fn query(
&self,
query: &Query,
options: &MatcherOptions,
) -> Outcome {
fn query(&self, query: &Query, options: &QueryOptions) -> Outcome {
let mut outcomes = vec![];

for fragment in query.iter() {
Expand All @@ -361,7 +458,10 @@ impl<T: AsRef<[u8]> + Debug + Display> QueryExt for Record<T> {
})
.filter(|field| {
if let Some(m) = path.subfield_matcher() {
m.is_match(field.subfields(), options)
m.is_match(
field.subfields(),
&options.into(),
)
} else {
true
}
Expand All @@ -388,6 +488,14 @@ impl<T: AsRef<[u8]> + Debug + Display> QueryExt for Record<T> {
Outcome::one()
}
})
.map(|outcome| {
if options.squash {
outcome
.squash(&options.separator)
} else {
outcome
}
})
.fold(Outcome::default(), |acc, e| {
acc * e
})
Expand All @@ -407,6 +515,13 @@ impl<T: AsRef<[u8]> + Debug + Display> QueryExt for Record<T> {

outcomes
.into_iter()
.map(|outcome| {
if options.merge {
outcome.merge(&options.separator)
} else {
outcome
}
})
.reduce(|acc, e| acc * e)
.unwrap_or_default()
}
Expand Down Expand Up @@ -521,7 +636,7 @@ mod tests {

#[test]
fn test_query() -> anyhow::Result<()> {
let options = MatcherOptions::default();
let options = QueryOptions::default();

let record =
RecordRef::new(vec![("012A", None, vec![('a', "1")])]);
Expand Down
29 changes: 25 additions & 4 deletions src/bin/pica/commands/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use std::str::FromStr;
use clap::Parser;
use pica_matcher::{MatcherOptions, RecordMatcher};
use pica_record::io::{ReaderBuilder, RecordsIterator};
use pica_select::{Query, QueryExt};
use pica_select::{Query, QueryExt, QueryOptions};
use serde::{Deserialize, Serialize};

use crate::common::FilterList;
Expand All @@ -31,6 +31,21 @@ pub(crate) struct Select {
#[arg(short, long)]
skip_invalid: bool,

/// Whether to squash all values of a repeated subfield into a
/// single value or not. The separator can be specified by the
/// `--separator` option.
#[arg(long)]
squash: bool,

#[arg(long)]
merge: bool,

/// Sets the separator used for squashing of repeated subfield
/// values into a single value. Note that it's possible to use the
/// empty string as a separator.
#[arg(long, default_value = "|")]
separator: String,

/// Disallow empty columns
#[arg(long)]
no_empty_columns: bool,
Expand Down Expand Up @@ -157,8 +172,11 @@ impl Select {
None
};

let options =
MatcherOptions::default().case_ignore(self.ignore_case);
let options = QueryOptions::default()
.case_ignore(self.ignore_case)
.separator(self.separator)
.squash(self.squash)
.merge(self.merge);

let matcher = if let Some(matcher_str) = self.filter {
let mut matcher = RecordMatcher::new(&translit_maybe2(
Expand Down Expand Up @@ -251,7 +269,10 @@ impl Select {
}

if let Some(ref matcher) = matcher {
if !matcher.is_match(&record, &options) {
if !matcher.is_match(
&record,
&MatcherOptions::from(&options),
) {
continue;
}
}
Expand Down
6 changes: 6 additions & 0 deletions tests/snapshot/select/042-select-squash.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
bin.name = "pica"
args = "select -s \"003@.0,012A{a, b}\" --squash"
status = "success"
stdin = "003@ \u001f0123456789X\u001e012A \u001fa123\u001fa456\u001fbabc\u001fbdef\u001fbhij\u001e\n"
stdout = "123456789X,123|456,abc|def|hij\n"
stderr = ""
6 changes: 6 additions & 0 deletions tests/snapshot/select/043-select-squash-sep.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
bin.name = "pica"
args = "select -s \"003@.0,010@.a\" --squash --separator '+++'"
status = "success"
stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n"
stdout = "123456789X,ger+++eng\n123456789X,mul\n"
stderr = ""
6 changes: 6 additions & 0 deletions tests/snapshot/select/044-select-squash-sep-empty-string.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
bin.name = "pica"
args = "select -s \"003@.0,010@.a\" --squash --separator ''"
status = "success"
stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n"
stdout = "123456789X,gereng\n123456789X,mul\n"
stderr = ""
6 changes: 6 additions & 0 deletions tests/snapshot/select/045-select-squash-sep-colon.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
bin.name = "pica"
args = "select -s \"003@.0,010@.a\" --squash --separator ','"
status = "success"
stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n"
stdout = "123456789X,\"ger,eng\"\n123456789X,mul\n"
stderr = ""
6 changes: 6 additions & 0 deletions tests/snapshot/select/046-select-squash-warning.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
bin.name = "pica"
args = "select -s \"003@.0,010@.a\" --squash --separator 'X'"
status = "success"
stderr = "WARNING: A subfield value contains squash separator 'X'.\n"
stdin = "003@ \u001f0123456789X\u001e010@ \u001faaXb\u001faeng\u001e010@ \u001famul\u001e\n"
stdout = "123456789X,aXbXeng\n123456789X,mul\n"
6 changes: 6 additions & 0 deletions tests/snapshot/select/047-select-squash-merge.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
bin.name = "pica"
args = "select -s \"003@.0,010@.a\" --squash --merge"
status = "success"
stdin = "003@ \u001f0123456789X\u001e010@ \u001fager\u001faeng\u001e010@ \u001famul\u001e\n"
stdout = "123456789X,ger|eng|mul\n"
stderr = ""

0 comments on commit c78fc2a

Please sign in to comment.