From 3296344fb95ff4a42c4a1ddab3846874d24b6f0e Mon Sep 17 00:00:00 2001 From: Nico Wagner Date: Wed, 16 Aug 2023 17:28:10 +0200 Subject: [PATCH] Adapt select/frequency to new path struct (#686) --- pica-matcher/src/subfield_matcher.rs | 14 +++---- pica-matcher/src/tag_matcher.rs | 2 +- pica-path/src/lib.rs | 2 +- pica-select/src/lib.rs | 15 +++++-- pica-toolkit/src/commands/frequency.rs | 38 ++++++++++-------- .../dump.dat.gz | Bin 0 -> 3675 bytes .../027-frequency-multiple-cols.toml | 5 +++ 7 files changed, 47 insertions(+), 29 deletions(-) create mode 100644 pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.in/dump.dat.gz create mode 100644 pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.toml diff --git a/pica-matcher/src/subfield_matcher.rs b/pica-matcher/src/subfield_matcher.rs index ae086682c..958c27982 100644 --- a/pica-matcher/src/subfield_matcher.rs +++ b/pica-matcher/src/subfield_matcher.rs @@ -52,7 +52,7 @@ pub trait Matcher { /// This matcher can be used to determine if a single subfield or a list /// of subfields contains at least one subfield with a code, that is /// contained in the matcher's code list. -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct ExistsMatcher { codes: Vec, } @@ -139,7 +139,7 @@ impl Matcher for ExistsMatcher { /// * StartsWith (`=^`) /// * EndsWith (`=$`) /// * Similar (`=*`) -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct RelationMatcher { codes: Vec, op: RelationalOp, @@ -331,7 +331,7 @@ impl Matcher for RelationMatcher { } /// A matcher that checks a subfield value against a regex. -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct RegexMatcher { codes: Vec, pattern: String, @@ -421,7 +421,7 @@ impl Matcher for RegexMatcher { } /// A matcher that checks if a subfield value is in a predefined list. -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct InMatcher { codes: Vec, values: Vec, @@ -520,7 +520,7 @@ impl Matcher for InMatcher { } /// A matcher that checks the number of occurrences of a subfield. -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct CardinalityMatcher { code: char, op: RelationalOp, @@ -615,7 +615,7 @@ impl Matcher for CardinalityMatcher { /// /// This matcher combines all atomic, singleton matcher into a new /// matcher. -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum SingletonMatcher { Cardinality(CardinalityMatcher), Exists(ExistsMatcher), @@ -685,7 +685,7 @@ impl Matcher for SingletonMatcher { /// A matcher that allows grouping, negation and connecting of singleton /// matcher. -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum SubfieldMatcher { Singleton(SingletonMatcher), Group(Box), diff --git a/pica-matcher/src/tag_matcher.rs b/pica-matcher/src/tag_matcher.rs index 430e5cf5f..852a3c2b7 100644 --- a/pica-matcher/src/tag_matcher.rs +++ b/pica-matcher/src/tag_matcher.rs @@ -12,7 +12,7 @@ use pica_record::{Tag, TagMut}; use crate::ParseMatcherError; /// A matcher that matches against PICA+ [Tags](`pica_record::Tag`). -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum TagMatcher { Simple(TagMut), Pattern([Vec; 4]), diff --git a/pica-path/src/lib.rs b/pica-path/src/lib.rs index 1ab9a8cb2..668e02a51 100644 --- a/pica-path/src/lib.rs +++ b/pica-path/src/lib.rs @@ -26,7 +26,7 @@ use thiserror::Error; #[error("invalid path expression, got `{0}`")] pub struct ParsePathError(String); -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct Path { tag_matcher: TagMatcher, occurrence_matcher: OccurrenceMatcher, diff --git a/pica-select/src/lib.rs b/pica-select/src/lib.rs index f8525bb0d..d6e6672d5 100644 --- a/pica-select/src/lib.rs +++ b/pica-select/src/lib.rs @@ -104,6 +104,12 @@ impl FromStr for Query { } } +impl From for Query { + fn from(path: Path) -> Self { + Self(vec![path.into()]) + } +} + #[derive(Debug, Copy, Clone)] enum Quotes { Single, @@ -467,15 +473,16 @@ impl + Debug + Display> QueryExt for Record { } }) .map(|field| { - // FIXME - path.codes_flat() + path.codes() .iter() - .map(|code| { + .map(|codes| { field .subfields() .iter() .filter(|subfield| { - subfield.code() == *code + codes.contains( + &subfield.code(), + ) }) .map(|subfield| { subfield.value() diff --git a/pica-toolkit/src/commands/frequency.rs b/pica-toolkit/src/commands/frequency.rs index 109f0e624..83610f0e0 100644 --- a/pica-toolkit/src/commands/frequency.rs +++ b/pica-toolkit/src/commands/frequency.rs @@ -5,11 +5,10 @@ use std::fs::File; use std::io::{self, Write}; use std::str::FromStr; -use bstr::BString; use clap::{value_parser, Parser}; -use pica_matcher::MatcherOptions; -use pica_path::{Path, PathExt}; +use pica_path::Path; use pica_record::io::{ReaderBuilder, RecordsIterator}; +use pica_select::{Query, QueryExt, QueryOptions}; use serde::{Deserialize, Serialize}; use crate::config::Config; @@ -123,8 +122,8 @@ impl Frequency { Path::from_str(&self.path)? }; - let mut ftable: HashMap = HashMap::new(); - let options = MatcherOptions::new() + let mut ftable: HashMap, u64> = HashMap::new(); + let options = QueryOptions::new() .strsim_threshold(self.strsim_threshold as f64 / 100f64) .case_ignore(self.ignore_case); @@ -151,10 +150,15 @@ impl Frequency { } } Ok(record) => { - for value in record.path(&path, &options) { - *ftable - .entry(BString::from(value.to_vec())) - .or_insert(0) += 1; + let outcome = record.query( + &Query::from(path.clone()), + &options, + ); + + for key in outcome.clone().into_iter() { + if key.iter().any(|e| !e.is_empty()) { + *ftable.entry(key).or_insert(0) += 1; + } } } } @@ -165,8 +169,9 @@ impl Frequency { writer.write_record(header.split(',').map(|s| s.trim()))?; } - let mut ftable_sorted: Vec<(&BString, &u64)> = + let mut ftable_sorted: Vec<(&Vec, &u64)> = ftable.iter().collect(); + if self.reverse { ftable_sorted.sort_by(|a, b| match a.1.cmp(b.1) { Ordering::Equal => a.0.cmp(b.0), @@ -179,7 +184,7 @@ impl Frequency { }); } - for (i, (value, frequency)) in ftable_sorted.iter().enumerate() + for (i, (values, frequency)) in ftable_sorted.iter().enumerate() { if self.limit > 0 && i >= self.limit { break; @@ -189,12 +194,13 @@ impl Frequency { break; } - let value = translit_maybe( - &value.to_string(), - self.translit.as_deref(), - ); + let mut record = values + .iter() + .map(|s| translit_maybe(s, self.translit.as_deref())) + .collect::>(); - writer.write_record(&[value, frequency.to_string()])?; + record.push(frequency.to_string()); + writer.write_record(record)?; } writer.flush()?; diff --git a/pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.in/dump.dat.gz b/pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.in/dump.dat.gz new file mode 100644 index 0000000000000000000000000000000000000000..06c3230dd45b9f0f5b47f7a52e18dc04e238c1b3 GIT binary patch literal 3675 zcmV-h4y5rPiwFohCthO!17vk=a4uwFbO5b7%W~t$k#~PYSM9+OJq`gv;Z3#eF!g9@ zMoo@ZQa{GGB2gq^f&h#MrRK>V_1Jyz#k)HZ_Bvw58|&~RCLBJ@(MMl(*SSMK;>-d8 z0^&pc7)cRAWoBhoRX!@Ssx*zYDbbj2X*(KIG)HmW2b#uqOQ3ElnyIiB$(XfcSv!WM zYMO>5`#{oypB?5X*tLbgrvS{ef#mT5zL8S@34ENV3_cC~)J#A!%f=1{bl{^bi+8rS zJtgd^emID>hoQGUCsK5vGGRD86qU+Zq zku=~ayw*HfG)X?6bD@KZ`!pcunxnWj#h^K?d@fi_8du1ASbT7-9a zHWIk(V`4@<9zbUumrw_LGM)CeW>6yb4-_U9H3uXMz#b)MXa#<{Sm(N(SlR(_>uk|> z{-ga}Y#%{88o}_NhM7oE+r9@=po9GSbwWK~(0xCh`ElTf6lO;kBcyrgNrJhc!c@+( zBp+va@<4Mm7@iIT!X*Qvhab~%AAC>SL7 z%HtSwEoNC3+vsr>6`8HZgB_TT?l`4K#(vWy>tduw7SLKZ35^@jxu4cB)i9PY2Gd)x z0;~zu9o+ykweH3N*I$e=c&f*Bnk$5-%bMx1muWr63{N}gt_2td~uA==U zhEbf{Uj6R&>YLlEuWqlty}kP9?bVmJpa1^$>hHH#SGQMxsP}7EW3SKXWC=PCUm$6iX1ja{Ty6l zQVZRt1$uNabhBpYmI%cp{AA<@Q5wY~97EEt_0?U+W!8$m5~0|He-hw>T6a?1CqmN)-xG%YOTv4WpgU?}}-J-FpZZk=W93?4KTV+sEa$*iQN*e=2vm z_CzFppNI4?_XCd(qJ$PvjPB(D>~1`uV70=FCR?;CF6d`bGN%57?u$IbZ7`$G(&egO^QM=G`?XvY`uMIX_wz|c}jGxC5Fr!cWOs%%=ih~~w6O|s(gNY9xCHT2N z_G97si}cD9>O~1u!TYwcwLM>w?^}=OF>KEM5iN%Hh(3w-ly3VGh2cD+i-7<^Lo)UR z)%T9&y2PauY|%VP;{=k3a0szvMDJMI78Tgr7YXD~us|-w`~uA5=^4m;#vvC0MWnO} zc0lSS$fz1GN?p&Sq-c7_7X9@4HGC(D@X<0Pp{;?daE+-9{vbS)o}sClrDBvOCZvd$ zB~5{?;E?nRpGpKe`EwuRQ zfO#G9sR*~|UKEC^WRZS2i;LRe9oNxJbH(5#LU9O(g_6B4$xm^i=h+>&sP+)|M%VqQ z=7c5g?Bo42^iD`iG|RS^5KwhM2b;;95Ysf}!N%lJXGJ;YD6pMojP01M9ZP%9a%@;H z$CwMlzposVL{L|)P7ZYl=kN>p)|%u|b9A@GY)4y501Jpag|U{PrRzHGf@S^*s)A4Z zGg#4~h(e6EMAKWk1)Ia2QgJ@`W@1`rSO=E_xdSdI&;VESuApNiwsi+G3rk@Q@M4i_ zsqH*nEtj{Ju6i_;2=o+Khog(7#bk3ks;mOoJl{j<01`}UMYoN5v`FvI1p z4Puz+^=oflx4%=*4Xbv+A-LaNsk_Mc;D5?9x!p=uP3k7<1IAuyJW608MwlV3n#D5N zd&oTDNdQh5-mmR;(i47`4wF2N7e#X%@bGOc5;n2H0L0+VIz~Q;7>I;t_e`y3^Y1aa z_90!8%jGog?Rv!XI9qap3i`g!o)=QxcjeIrv|uRUJ~zsLJXxQqfk?ddSlR~kS|fqrj|v9 z^@dC%Q`n2ri&QN)8JS1U_Zgy>&{C8qMFDfwc40&|h^jRPU0@EWmW4+j!+PVLL{;?& zq#s8I?|3>kT&0ToCRxzpmW5@+g7$|<`Qyf>R}>PRj$eL8gTcc>=0`s`=#R${_@5s_ zMjO&uKA}vtRb68ay?~4sv)ml21lqxa#30VYaRwb!3a%=$8PrJaAQM~EPm&Bwc!CB8 z&F7f$?$APmI{fKSU(EICQjO7VTIi4&6tZGv-qJMCC|$q4^br&=Gk?MZ;EgJiwG#H1 zEf-QJXaR3jnyx##lq*{rs?goQyIh^!yg%xR)X!!+^r+q5g}H#hq%1&Al4yz*LO6p8 zYz!ko4-u1C1SQ3%ko_thSPYfosc2bGfyLWYuj-&f5h?rma@hp%Zec+6(rf#$yI(?% zqKSy1sFki6dOH-W0u5ExT3G1C#f3U3twQz0c2#bm)6$2qi%?8&KfasQNNem=;rKVQX5j!6l* z(&0bDBd_|X)<~Cd$h0~ljP4+SmruDzJCO+U37I~NV80pOWs6kMUDCW03Ia=w`KL+G zPiX*YYM7_1h&M61i#rklOec;4Ki$B2Zel9!sH?zDs8o}Z=7|yrW#zgT#(LKn>N6eQ z8U0v)(X_2HXlG5H!sLx}@*5b6DpX$jtNkLWxUWz_Gq* zh=^9twkEuW>HO=@VMGhOTt^Y6ex<<<7aOqW3HL?6VPiGa&(@#{MXy)~B#Q=>LV0uT zE^FDf4un%{QnZZen)Tq`{t{g-^1_Mbz*t#!112p z>Th`G@cBRF_cym!ugMVaAwK^TfS1@mdFgxi$XGgZb&YsWBmG*&-dS?ag`bR}P0y+Jn@lLK?(@rvV^EoK?=yg;69T%gWtM;LX2#j!yJ zU3uFb0UJoCb-6ip_sI|vr!fNBG8KSo0IPUGyHL}6kfHe{MS1+L8!eFVqBy&sABq() zkjNL^%&3RPP@J3Fs`HNdXtsJZ^oKZ_A2;S+Bq{sdUFDc(Jcx$nCGIpTE@%OT$Dqei z6k|!~19;+NEgjNlBI!Y{0HsbAiX@~%fw#OPjAFxu_P7QVmu-kT{j^1Qs{wahjdv0SgQ!hY2s@j)h! z*gx>$MC}nq-D2kTizsS+Oy|=WFB0g(^C-43*H>z@N-yy4L!QF%5S_ACimZ4U;aJ)| zFC)sgNKWBdI=+msjPKwr5~$3vOnb9u6-4roceQ8L2x^{HLf5`UDxC(ZH`QS4-XaCV zDDkt=M4oD<^a%tn5ti7ZMTW<6!R48ZpdW`=Px)Y6vx-Ze;fp5W(ExN9f<=~?4)2f9 zChPFRFH?zt!mKXzWJQ+^Uh1GrG`zHceDG_gI8H>Ef$>8fL3uGdbl7$N>N*xI@m+tF z(C^@N6O_BQZLhfIE)aMFRxBZ2>z%!XdUrz1Y0Xs)?R{KzNka`=oEQ1xQeB+fk5@P# t(@dj<4ru&`^fkk@Y{zZAXgnoiI`q9Ggj`!_#xR)#$&008dW7s&ts literal 0 HcmV?d00001 diff --git a/pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.toml b/pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.toml new file mode 100644 index 000000000..7fa2d4f65 --- /dev/null +++ b/pica-toolkit/tests/snapshot/frequency/027-frequency-multiple-cols.toml @@ -0,0 +1,5 @@ +bin.name = "pica" +args = "frequency -s \"065R{ (9, 4) | 9? && 4 =^ 'ort' }\" dump.dat.gz" +status = "success" +stdout = "040660095,ortg,1\n040787044,ortx,1\n041178548,orts,1\n041178548,ortw,1\n" +stderr = ""