Skip to content

Commit

Permalink
Adapt select/frequency to new path struct (#686)
Browse files Browse the repository at this point in the history
  • Loading branch information
nwagner84 authored Aug 16, 2023
1 parent 061ecd4 commit 3296344
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 29 deletions.
14 changes: 7 additions & 7 deletions pica-matcher/src/subfield_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pub trait Matcher {
/// This matcher can be used to determine if a single subfield or a list
/// of subfields contains at least one subfield with a code that is
/// contained in the matcher's code list.
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ExistsMatcher {
codes: Vec<char>,
}
Expand Down Expand Up @@ -139,7 +139,7 @@ impl Matcher for ExistsMatcher {
/// * StartsWith (`=^`)
/// * EndsWith (`=$`)
/// * Similar (`=*`)
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RelationMatcher {
codes: Vec<char>,
op: RelationalOp,
Expand Down Expand Up @@ -331,7 +331,7 @@ impl Matcher for RelationMatcher {
}

/// A matcher that checks a subfield value against a regex.
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RegexMatcher {
codes: Vec<char>,
pattern: String,
Expand Down Expand Up @@ -421,7 +421,7 @@ impl Matcher for RegexMatcher {
}

/// A matcher that checks if a subfield value is in a predefined list.
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct InMatcher {
codes: Vec<char>,
values: Vec<BString>,
Expand Down Expand Up @@ -520,7 +520,7 @@ impl Matcher for InMatcher {
}

/// A matcher that checks the number of occurrences of a subfield.
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CardinalityMatcher {
code: char,
op: RelationalOp,
Expand Down Expand Up @@ -615,7 +615,7 @@ impl Matcher for CardinalityMatcher {
///
/// This matcher combines all atomic, singleton matchers into a new
/// matcher.
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum SingletonMatcher {
Cardinality(CardinalityMatcher),
Exists(ExistsMatcher),
Expand Down Expand Up @@ -685,7 +685,7 @@ impl Matcher for SingletonMatcher {

/// A matcher that allows grouping, negation and connecting of singleton
/// matchers.
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum SubfieldMatcher {
Singleton(SingletonMatcher),
Group(Box<SubfieldMatcher>),
Expand Down
2 changes: 1 addition & 1 deletion pica-matcher/src/tag_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use pica_record::{Tag, TagMut};
use crate::ParseMatcherError;

/// A matcher that matches against PICA+ [Tags](`pica_record::Tag`).
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum TagMatcher {
Simple(TagMut),
Pattern([Vec<char>; 4]),
Expand Down
2 changes: 1 addition & 1 deletion pica-path/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use thiserror::Error;
#[error("invalid path expression, got `{0}`")]
pub struct ParsePathError(String);

#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Path {
tag_matcher: TagMatcher,
occurrence_matcher: OccurrenceMatcher,
Expand Down
15 changes: 11 additions & 4 deletions pica-select/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ impl FromStr for Query {
}
}

impl From<Path> for Query {
fn from(path: Path) -> Self {
Self(vec![path.into()])
}
}

#[derive(Debug, Copy, Clone)]
enum Quotes {
Single,
Expand Down Expand Up @@ -467,15 +473,16 @@ impl<T: AsRef<[u8]> + Debug + Display> QueryExt for Record<T> {
}
})
.map(|field| {
// FIXME
path.codes_flat()
path.codes()
.iter()
.map(|code| {
.map(|codes| {
field
.subfields()
.iter()
.filter(|subfield| {
subfield.code() == *code
codes.contains(
&subfield.code(),
)
})
.map(|subfield| {
subfield.value()
Expand Down
38 changes: 22 additions & 16 deletions pica-toolkit/src/commands/frequency.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@ use std::fs::File;
use std::io::{self, Write};
use std::str::FromStr;

use bstr::BString;
use clap::{value_parser, Parser};
use pica_matcher::MatcherOptions;
use pica_path::{Path, PathExt};
use pica_path::Path;
use pica_record::io::{ReaderBuilder, RecordsIterator};
use pica_select::{Query, QueryExt, QueryOptions};
use serde::{Deserialize, Serialize};

use crate::config::Config;
Expand Down Expand Up @@ -123,8 +122,8 @@ impl Frequency {
Path::from_str(&self.path)?
};

let mut ftable: HashMap<BString, u64> = HashMap::new();
let options = MatcherOptions::new()
let mut ftable: HashMap<Vec<String>, u64> = HashMap::new();
let options = QueryOptions::new()
.strsim_threshold(self.strsim_threshold as f64 / 100f64)
.case_ignore(self.ignore_case);

Expand All @@ -151,10 +150,15 @@ impl Frequency {
}
}
Ok(record) => {
for value in record.path(&path, &options) {
*ftable
.entry(BString::from(value.to_vec()))
.or_insert(0) += 1;
let outcome = record.query(
&Query::from(path.clone()),
&options,
);

for key in outcome.clone().into_iter() {
if key.iter().any(|e| !e.is_empty()) {
*ftable.entry(key).or_insert(0) += 1;
}
}
}
}
Expand All @@ -165,8 +169,9 @@ impl Frequency {
writer.write_record(header.split(',').map(|s| s.trim()))?;
}

let mut ftable_sorted: Vec<(&BString, &u64)> =
let mut ftable_sorted: Vec<(&Vec<String>, &u64)> =
ftable.iter().collect();

if self.reverse {
ftable_sorted.sort_by(|a, b| match a.1.cmp(b.1) {
Ordering::Equal => a.0.cmp(b.0),
Expand All @@ -179,7 +184,7 @@ impl Frequency {
});
}

for (i, (value, frequency)) in ftable_sorted.iter().enumerate()
for (i, (values, frequency)) in ftable_sorted.iter().enumerate()
{
if self.limit > 0 && i >= self.limit {
break;
Expand All @@ -189,12 +194,13 @@ impl Frequency {
break;
}

let value = translit_maybe(
&value.to_string(),
self.translit.as_deref(),
);
let mut record = values
.iter()
.map(|s| translit_maybe(s, self.translit.as_deref()))
.collect::<Vec<_>>();

writer.write_record(&[value, frequency.to_string()])?;
record.push(frequency.to_string());
writer.write_record(record)?;
}

writer.flush()?;
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Command case for the `pica` binary: invokes the `frequency` subcommand
# with the `-s` flag, a path expression, and the input `dump.dat.gz`.
# Presumably a trycmd-style test fixture (confirm against the test
# harness): `status`, `stdout` and `stderr` hold the expected outcome,
# here a successful exit, four rows of three comma-separated columns on
# stdout, and nothing on stderr.
bin.name = "pica"
args = "frequency -s \"065R{ (9, 4) | 9? && 4 =^ 'ort' }\" dump.dat.gz"
status = "success"
stdout = "040660095,ortg,1\n040787044,ortx,1\n041178548,orts,1\n041178548,ortw,1\n"
stderr = ""

0 comments on commit 3296344

Please sign in to comment.