From 1ce384b97c5e3ccc387dc266636cf646dfc53d99 Mon Sep 17 00:00:00 2001
From: Bryant Luk
Date: Sat, 16 Dec 2023 10:40:48 -0600
Subject: [PATCH] Add options to tokencount example

---
 examples/tokencount/Cargo.toml  |   9 +-
 examples/tokencount/src/main.rs | 168 ++++++++++++++++++++------------
 examples/tokencount/testing.nu  |  15 ++-
 maybe_xml/src/lib.rs            |   4 +-
 4 files changed, 118 insertions(+), 78 deletions(-)

diff --git a/examples/tokencount/Cargo.toml b/examples/tokencount/Cargo.toml
index 9438cf6..f701a02 100644
--- a/examples/tokencount/Cargo.toml
+++ b/examples/tokencount/Cargo.toml
@@ -5,11 +5,6 @@ name = "tokencount"
 publish = false
 version = "0.0.0"
 
-[features]
-default = ["internal_unstable"]
-
-# Exposing internal and unstable APIs
-internal_unstable = ["maybe_xml/internal_unstable"]
-
 [dependencies]
-maybe_xml = { path = "../../maybe_xml" }
+clap = { version = "4.4.11", features = ["derive"] }
+maybe_xml = { path = "../../maybe_xml", features = ["internal_unstable"] }
diff --git a/examples/tokencount/src/main.rs b/examples/tokencount/src/main.rs
index 3f02195..11e4c57 100644
--- a/examples/tokencount/src/main.rs
+++ b/examples/tokencount/src/main.rs
@@ -1,5 +1,8 @@
 use std::io;
 
+use clap::{Parser, Subcommand};
+use maybe_xml::ScanDocumentOpts;
+
 use maybe_xml::{token, Reader};
 
 #[derive(Debug, Default)]
@@ -14,84 +17,121 @@ struct TokenCounters {
     cdata: usize,
 }
 
+#[derive(Debug, Parser)]
+struct Args {
+    #[command(subcommand)]
+    cmd: Option<Cmd>,
+}
+
+#[derive(Debug, Subcommand)]
+enum Cmd {
+    Count,
+    VerifyStrictXml,
+    VerifyRelaxed,
+    VerifyAssumeXml,
+}
+
+/// Returns `true` if `input` begins with the 3-byte UTF-8 byte order mark (`EF BB BF`).
+#[inline]
+#[must_use]
+fn is_utf8_bom(input: &[u8]) -> bool {
+    input.starts_with(&[0xEF, 0xBB, 0xBF])
+}
+
+/// Returns `true` if `scan_document` with `opts` reports reading all of `input`.
+///
+/// A leading UTF-8 byte order mark, if present, is skipped before scanning.
+#[inline]
+#[must_use]
+fn complete_scan(input: &[u8], opts: ScanDocumentOpts) -> bool {
+    let input = if is_utf8_bom(input) {
+        &input[3..]
+    } else {
+        input
+    };
+
+    maybe_xml::scan_document(input, 0, opts) == Some(input.len())
+}
+
 fn main() -> io::Result<()> {
+    let args = Args::parse();
+
     let stdin = io::read_to_string(io::stdin())?;
 
-    let mut counters = TokenCounters::default();
+    match args.cmd {
+        Some(Cmd::Count) | None => {
+            let mut counters = TokenCounters::default();
 
-    let reader = Reader::from_str(&stdin);
+            let reader = Reader::from_str(&stdin);
 
-    let mut pos = 0;
+            let mut pos = 0;
 
-    while let Some(token) = reader.tokenize(&mut pos) {
-        match token.ty() {
-            token::Ty::StartTag(_) => {
-                counters.start_tag += 1;
-            }
-            token::Ty::EmptyElementTag(_) => {
-                counters.empty_element_tag += 1;
-            }
-            token::Ty::EndTag(_) => {
-                counters.end_tag += 1;
-            }
-            token::Ty::Characters(_) => {
-                counters.chars += 1;
+            while let Some(token) = reader.tokenize(&mut pos) {
+                match token.ty() {
+                    token::Ty::StartTag(_) => {
+                        counters.start_tag += 1;
+                    }
+                    token::Ty::EmptyElementTag(_) => {
+                        counters.empty_element_tag += 1;
+                    }
+                    token::Ty::EndTag(_) => {
+                        counters.end_tag += 1;
+                    }
+                    token::Ty::Characters(_) => {
+                        counters.chars += 1;
+                    }
+                    token::Ty::ProcessingInstruction(_) => {
+                        counters.pi += 1;
+                    }
+                    token::Ty::Declaration(_) => {
+                        counters.decl += 1;
+                    }
+                    token::Ty::Comment(_) => {
+                        counters.comment += 1;
+                    }
+                    token::Ty::Cdata(_) => {
+                        counters.cdata += 1;
+                    }
+                }
             }
-            token::Ty::ProcessingInstruction(_) => {
-                counters.pi += 1;
-            }
-            token::Ty::Declaration(_) => {
-                counters.decl += 1;
-            }
-            token::Ty::Comment(_) => {
-                counters.comment += 1;
-            }
-            token::Ty::Cdata(_) => {
-                counters.cdata += 1;
-            }
-        }
-    }
-
-    if pos != stdin.len() {
-        let error = format!("should have read the entire stdin but only read to {pos} bytes");
-        return Err(io::Error::new(io::ErrorKind::Other, error));
-    }
 
-    #[cfg(feature = "internal_unstable")]
-    {
-        use maybe_xml::ScanDocumentOpts;
+            if pos != stdin.len() {
+                let error =
+                    format!("should have read the entire stdin but only read to {pos} bytes");
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
 
-        if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::new())
-            != Some(stdin.len())
-        {
-            let error = "scan_document with DEFAULT options should have read the entire stdin";
-            return Err(io::Error::new(io::ErrorKind::Other, error));
+            println!("Start Tag: {}", counters.start_tag);
+            println!("Empty Element Tag: {}", counters.empty_element_tag);
+            println!("End Tag: {}", counters.end_tag);
+            println!("Characters: {}", counters.chars);
+            println!("Processing Instruction: {}", counters.pi);
+            println!("Declaration: {}", counters.decl);
+            println!("Comment: {}", counters.comment);
+            println!("Cdata: {}", counters.cdata);
         }
-
-        if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::relaxed())
-            != Some(stdin.len())
-        {
-            let error = "scan_document with RELAXED options should have read the entire stdin";
-            return Err(io::Error::new(io::ErrorKind::Other, error));
+        Some(Cmd::VerifyStrictXml) => {
+            if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::new()) {
+                let error = "scan_document with DEFAULT options should have read the entire stdin";
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
         }
-
-        if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::assume_valid_xml())
-            != Some(stdin.len())
-        {
-            let error =
-                "scan_document with ASSUME VALID XML options should have read the entire stdin";
-            return Err(io::Error::new(io::ErrorKind::Other, error));
+        Some(Cmd::VerifyRelaxed) => {
+            if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::relaxed()) {
+                let error = "scan_document with RELAXED options should have read the entire stdin";
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
+        }
+        Some(Cmd::VerifyAssumeXml) => {
+            if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::assume_valid_xml()) {
+                let error =
+                    "scan_document with ASSUME VALID XML options should have read the entire stdin";
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
         }
     }
 
-    println!("Start Tag: {}", counters.start_tag);
-    println!("Empty Element Tag: {}", counters.empty_element_tag);
-    println!("End Tag: {}", counters.end_tag);
-    println!("Characters: {}", counters.chars);
-    println!("Processing Instruction: {}", counters.pi);
-    println!("Declaration: {}", counters.decl);
-    println!("Comment: {}", counters.comment);
-    println!("Cdata: {}", counters.cdata);
+    println!("OK");
 
     Ok(())
 }
diff --git a/examples/tokencount/testing.nu b/examples/tokencount/testing.nu
index 456bb67..e89db0e 100644
--- a/examples/tokencount/testing.nu
+++ b/examples/tokencount/testing.nu
@@ -9,17 +9,22 @@
 #
 # Example:
 #
-# maybe_xml_find_docs "DOCTYPE" 50 | maybe_xml_tc
+# maybe_xml_find_docs "DOCTYPE" 50 | maybe_xml_tc | select url tc.exit_code strict.exit_code relaxed.exit_code assume.exit_code | where strict_exit_code != 0 or relaxed_exit_code != 0 or assume_exit_code != 0
 
 def maybe_xml_tc [] {
     each { |it|
-        let tc = do { http get $it --raw | tokencount } | complete
+        let text = http get $it --raw
+        let tc = do { $text | tokencount count } | complete
+        let strict = do { $text | tokencount verify-strict-xml } | complete
+        let relaxed = do { $text | tokencount verify-relaxed } | complete
+        let assume = do { $text | tokencount verify-assume-xml } | complete
 
         {
             url: $it,
-            tc_exit_code: $tc.exit_code,
-            tc_stderr: $tc.stderr,
-            tc_stdout: $tc.stdout,
+            tc: $tc
+            strict: $strict
+            relaxed: $relaxed
+            assume: $assume
         }
     }
 }
diff --git a/maybe_xml/src/lib.rs b/maybe_xml/src/lib.rs
index e285409..8c04603 100644
--- a/maybe_xml/src/lib.rs
+++ b/maybe_xml/src/lib.rs
@@ -122,8 +122,8 @@ const fn is_utf8_boundary(byte: u8) -> bool {
     byte as i8 >= -0x40
 }
 
-#[cfg(feature = "internal_unstable")]
+#[cfg(any(test, feature = "internal_unstable"))]
 pub use read::parser::scan_document;
 
-#[cfg(feature = "internal_unstable")]
+#[cfg(any(test, feature = "internal_unstable"))]
 pub use read::parser::ScanDocumentOpts;