Skip to content

Commit

Permalink
frequency: add filter options (#750)
Browse files Browse the repository at this point in the history
  • Loading branch information
nwagner84 authored Feb 1, 2024
1 parent a4c9e79 commit f603116
Show file tree
Hide file tree
Showing 10 changed files with 85 additions and 0 deletions.
54 changes: 54 additions & 0 deletions crates/pica-toolkit/src/commands/frequency.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::io::{self, Write};
use std::str::FromStr;

use clap::{value_parser, Parser};
use pica_matcher::{MatcherBuilder, MatcherOptions};
use pica_record::io::{ReaderBuilder, RecordsIterator};
use pica_select::{Query, QueryExt, QueryOptions};
use pica_utils::NormalizationForm;
Expand Down Expand Up @@ -78,6 +79,31 @@ pub(crate) struct Frequency {
)]
threshold: u64,

/// A filter expression used for searching
#[arg(long = "where")]
filter: Option<String>,

/// Connects the where clause with additional expressions using the
/// logical AND-operator (conjunction)
///
/// This option can't be combined with `--or` or `--not`.
#[arg(long, requires = "filter", conflicts_with_all = ["or", "not"])]
and: Vec<String>,

/// Connects the where clause with additional expressions using the
/// logical OR-operator (disjunction)
///
/// This option can't be combined with `--and` or `--not`.
#[arg(long, requires = "filter", conflicts_with_all = ["and", "not"])]
or: Vec<String>,

/// Connects the where clause with additional expressions using the
/// logical NOT-operator (negation)
///
/// This option can't be combined with `--and` or `--or`.
#[arg(long, requires = "filter", conflicts_with_all = ["and", "or"])]
not: Vec<String>,

/// Comma-separated list of column names.
#[arg(long, short = 'H')]
header: Option<String>,
Expand Down Expand Up @@ -126,6 +152,24 @@ impl Frequency {

let query = Query::from_str(&query)?;

let nf = if let Some(ref global) = config.global {
global.translit
} else {
None
};

let matcher = if let Some(matcher) = self.filter {
Some(
MatcherBuilder::new(matcher, nf)?
.and(self.and)?
.not(self.not)?
.or(self.or)?
.build(),
)
} else {
None
};

let mut ftable: HashMap<Vec<String>, u64> = HashMap::new();
let options = QueryOptions::new()
.strsim_threshold(self.strsim_threshold as f64 / 100f64)
Expand Down Expand Up @@ -158,6 +202,16 @@ impl Frequency {
}

let record = result.unwrap();

if let Some(ref matcher) = matcher {
if !matcher.is_match(
&record,
&MatcherOptions::from(&options),
) {
continue;
}
}

progress.record();
seen.clear();

Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin.name = "pica"
args = "frequency -s \"002@.0\" --where \"002@.0 =^ 'Ts'\" dump.dat.gz"
status = "success"
stdout = "Ts1,2\nTsz,1\n"
stderr = ""
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin.name = "pica"
args = "frequency -s \"002@.0\" --where \"002@.0 == 'Ts1'\" --or \"002@.0 == 'Tsz'\" dump.dat.gz"
status = "success"
stdout = "Ts1,2\nTsz,1\n"
stderr = ""
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin.name = "pica"
args = "frequency -s \"002@.0\" --where \"002@.0 =^ 'Ts'\" --and \"002@.0 != 'Tsz'\" dump.dat.gz"
status = "success"
stdout = "Ts1,2\n"
stderr = ""
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin.name = "pica"
args = "frequency -s \"002@.0\" --where \"002@.0 =^ 'Ts'\" --not \"002@.0 == 'Tsz'\" dump.dat.gz"
status = "success"
stdout = "Ts1,2\n"
stderr = ""
11 changes: 11 additions & 0 deletions docs/book/src/referenz/kommandos/frequency.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ Ts1,1
* `-l`, `--limit` `<n>` — Eingrenzung der Ausgabe auf die häufigsten
`<n>` Unterfeldwerte.
* `--threshold` `<n>` — Zeilen mit einer Häufigkeit < `<n>` ignorieren.
* `--where` `<filter>` — Angabe eines Filters, der auf die eingelesenen
Datensätze angewandt wird; nur passende Datensätze werden gezählt.
* `--and` `<expr>` — Hinzufügen eines zusätzlichen Filters mittels der
booleschen `&&`-Verknüpfung. Der ursprüngliche Filterausdruck
`<filter>` wird zum Ausdruck `<filter> && <expr>`.
* `--or` `<expr>` — Hinzufügen eines zusätzlichen Filters mittels der
booleschen `||`-Verknüpfung. Der ursprüngliche Filterausdruck
`<filter>` wird zum Ausdruck `<filter> || <expr>`.
* `--not` `<expr>` — Hinzufügen eines zusätzlichen Filters. Der
ursprüngliche Filterausdruck `<filter>` wird zum Ausdruck `<filter> &&
!(<expr>)`.
* `-H`, `--header` `<header>` — Kopfzeile, die den Ergebnissen
vorangestellt wird.
* `-t`, `--tsv` — Ausgabe erfolgt im TSV-Format.
Expand Down

0 comments on commit f603116

Please sign in to comment.