diff --git a/crates/pica-toolkit/src/commands/frequency.rs b/crates/pica-toolkit/src/commands/frequency.rs index bcd4b5281..e1bafd895 100644 --- a/crates/pica-toolkit/src/commands/frequency.rs +++ b/crates/pica-toolkit/src/commands/frequency.rs @@ -6,6 +6,7 @@ use std::io::{self, Write}; use std::str::FromStr; use clap::{value_parser, Parser}; +use pica_matcher::{MatcherBuilder, MatcherOptions}; use pica_record::io::{ReaderBuilder, RecordsIterator}; use pica_select::{Query, QueryExt, QueryOptions}; use pica_utils::NormalizationForm; @@ -78,6 +79,31 @@ pub(crate) struct Frequency { )] threshold: u64, + /// A filter expression used for searching + #[arg(long = "where")] + filter: Option, + + /// Connects the where clause with additional expressions using the + /// logical AND-operator (conjunction) + /// + /// This option can't be combined with `--or` or `--not`. + #[arg(long, requires = "filter", conflicts_with_all = ["or", "not"])] + and: Vec, + + /// Connects the where clause with additional expressions using the + /// logical OR-operator (disjunction) + /// + /// This option can't be combined with `--and` or `--not`. + #[arg(long, requires = "filter", conflicts_with_all = ["and", "not"])] + or: Vec, + + /// Connects the where clause with additional expressions using the + /// logical NOT-operator (negation) + /// + /// This option can't be combined with `--and` or `--or`. + #[arg(long, requires = "filter", conflicts_with_all = ["and", "or"])] + not: Vec, + /// Comma-separated list of column names. #[arg(long, short = 'H')] header: Option, @@ -126,6 +152,24 @@ impl Frequency { let query = Query::from_str(&query)?; + let nf = if let Some(ref global) = config.global { + global.translit + } else { + None + }; + + let matcher = if let Some(matcher) = self.filter { + Some( + MatcherBuilder::new(matcher, nf)? + .and(self.and)? + .not(self.not)? + .or(self.or)? 
+ .build(), + ) + } else { + None + }; + let mut ftable: HashMap, u64> = HashMap::new(); let options = QueryOptions::new() .strsim_threshold(self.strsim_threshold as f64 / 100f64) @@ -158,6 +202,16 @@ impl Frequency { } let record = result.unwrap(); + + if let Some(ref matcher) = matcher { + if !matcher.is_match( + &record, + &MatcherOptions::from(&options), + ) { + continue; + } + } + progress.record(); seen.clear(); diff --git a/crates/pica-toolkit/tests/snapshot/frequency/030-frequency-where.in/dump.dat.gz b/crates/pica-toolkit/tests/snapshot/frequency/030-frequency-where.in/dump.dat.gz new file mode 100644 index 000000000..06c3230dd Binary files /dev/null and b/crates/pica-toolkit/tests/snapshot/frequency/030-frequency-where.in/dump.dat.gz differ diff --git a/crates/pica-toolkit/tests/snapshot/frequency/030-frequency-where.toml b/crates/pica-toolkit/tests/snapshot/frequency/030-frequency-where.toml new file mode 100644 index 000000000..774256156 --- /dev/null +++ b/crates/pica-toolkit/tests/snapshot/frequency/030-frequency-where.toml @@ -0,0 +1,5 @@ +bin.name = "pica" +args = "frequency -s \"002@.0\" --where \"002@.0 =^ 'Ts'\" dump.dat.gz" +status = "success" +stdout = "Ts1,2\nTsz,1\n" +stderr = "" diff --git a/crates/pica-toolkit/tests/snapshot/frequency/031-frequency-or.in/dump.dat.gz b/crates/pica-toolkit/tests/snapshot/frequency/031-frequency-or.in/dump.dat.gz new file mode 100644 index 000000000..06c3230dd Binary files /dev/null and b/crates/pica-toolkit/tests/snapshot/frequency/031-frequency-or.in/dump.dat.gz differ diff --git a/crates/pica-toolkit/tests/snapshot/frequency/031-frequency-or.toml b/crates/pica-toolkit/tests/snapshot/frequency/031-frequency-or.toml new file mode 100644 index 000000000..d9481bab2 --- /dev/null +++ b/crates/pica-toolkit/tests/snapshot/frequency/031-frequency-or.toml @@ -0,0 +1,5 @@ +bin.name = "pica" +args = "frequency -s \"002@.0\" --where \"002@.0 == 'Ts1'\" --or \"002@.0 == 'Tsz'\" dump.dat.gz" +status = "success" +stdout = 
"Ts1,2\nTsz,1\n" +stderr = "" diff --git a/crates/pica-toolkit/tests/snapshot/frequency/032-frequency-and.in/dump.dat.gz b/crates/pica-toolkit/tests/snapshot/frequency/032-frequency-and.in/dump.dat.gz new file mode 100644 index 000000000..06c3230dd Binary files /dev/null and b/crates/pica-toolkit/tests/snapshot/frequency/032-frequency-and.in/dump.dat.gz differ diff --git a/crates/pica-toolkit/tests/snapshot/frequency/032-frequency-and.toml b/crates/pica-toolkit/tests/snapshot/frequency/032-frequency-and.toml new file mode 100644 index 000000000..f566bab4c --- /dev/null +++ b/crates/pica-toolkit/tests/snapshot/frequency/032-frequency-and.toml @@ -0,0 +1,5 @@ +bin.name = "pica" +args = "frequency -s \"002@.0\" --where \"002@.0 =^ 'Ts'\" --and \"002@.0 != 'Tsz'\" dump.dat.gz" +status = "success" +stdout = "Ts1,2\n" +stderr = "" diff --git a/crates/pica-toolkit/tests/snapshot/frequency/033-frequency-not.in/dump.dat.gz b/crates/pica-toolkit/tests/snapshot/frequency/033-frequency-not.in/dump.dat.gz new file mode 100644 index 000000000..06c3230dd Binary files /dev/null and b/crates/pica-toolkit/tests/snapshot/frequency/033-frequency-not.in/dump.dat.gz differ diff --git a/crates/pica-toolkit/tests/snapshot/frequency/033-frequency-not.toml b/crates/pica-toolkit/tests/snapshot/frequency/033-frequency-not.toml new file mode 100644 index 000000000..5a610daf7 --- /dev/null +++ b/crates/pica-toolkit/tests/snapshot/frequency/033-frequency-not.toml @@ -0,0 +1,5 @@ +bin.name = "pica" +args = "frequency -s \"002@.0\" --where \"002@.0 =^ 'Ts'\" --not \"002@.0 == 'Tsz'\" dump.dat.gz" +status = "success" +stdout = "Ts1,2\n" +stderr = "" diff --git a/docs/book/src/referenz/kommandos/frequency.md b/docs/book/src/referenz/kommandos/frequency.md index 553e27c8b..3bdb5b577 100644 --- a/docs/book/src/referenz/kommandos/frequency.md +++ b/docs/book/src/referenz/kommandos/frequency.md @@ -40,6 +40,17 @@ Ts1,1 * `-l`, `--limit` `` — Eingrenzung der Ausgabe auf die häufigsten `` Unterfeldwerte. 
* `--threshold` `<n>` — Zeilen mit einer Häufigkeit < `<n>` ignorieren. +* `--where` `<filter>` — Angabe eines Filters, der auf die erzeugten + Datensätze angewandt wird. +* `--and` `<expr>` — Hinzufügen eines zusätzlichen Filters mittels der + booleschen `&&`-Verknüpfung. Der ursprüngliche Filterausdruck + `<filter>` wird zum Ausdruck `<filter> && <expr>`. +* `--or` `<expr>` — Hinzufügen eines zusätzlichen Filters mittels der + booleschen `||`-Verknüpfung. Der ursprüngliche Filterausdruck + `<filter>` wird zum Ausdruck `<filter> || <expr>`. +* `--not` `<expr>` — Hinzufügen eines zusätzlichen Filters. Der + ursprüngliche Filterausdruck `<filter>` wird zum Ausdruck `<filter> && + !(<expr>)`. * `-H`, `--header` `<header>` — Kopfzeile, die den Ergebnissen vorangestellt wird. * `-t`, `--tsv` — Ausgabe erfolgt im TSV-Format.