Skip to content

Commit

Permalink
Add options to tokencount example
Browse files Browse the repository at this point in the history
  • Loading branch information
bluk committed Dec 16, 2023
1 parent 6d68a74 commit 1ce384b
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 78 deletions.
9 changes: 2 additions & 7 deletions examples/tokencount/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ name = "tokencount"
publish = false
version = "0.0.0"

[features]
default = ["internal_unstable"]

# Exposing internal and unstable APIs
internal_unstable = ["maybe_xml/internal_unstable"]

[dependencies]
maybe_xml = { path = "../../maybe_xml" }
clap = { version = "4.4.11", features = ["derive"] }
maybe_xml = { path = "../../maybe_xml", features = ["internal_unstable"] }
168 changes: 104 additions & 64 deletions examples/tokencount/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use std::io;

use clap::{Parser, Subcommand};
use maybe_xml::ScanDocumentOpts;

use maybe_xml::{token, Reader};

#[derive(Debug, Default)]
Expand All @@ -14,84 +17,121 @@ struct TokenCounters {
cdata: usize,
}

// Command-line arguments for the tokencount example, parsed by clap's derive API.
//
// NOTE: plain `//` comments are used deliberately; clap's derive turns `///`
// doc comments into help/about text, which would change the program's output.
#[derive(Debug, Parser)]
struct Args {
// Optional subcommand. When it is omitted, `main` handles `None` in the
// same match arm as `Cmd::Count`.
#[command(subcommand)]
cmd: Option<Cmd>,
}

// Subcommands selecting what the example does with the document read from stdin.
//
// NOTE: plain `//` comments are used deliberately; clap's derive turns `///`
// doc comments into help text, which would change the program's output.
// The accompanying testing.nu script invokes these as `count`,
// `verify-strict-xml`, `verify-relaxed` and `verify-assume-xml`.
#[derive(Debug, Subcommand)]
enum Cmd {
// Tokenize the input with `Reader` and print a count per token type.
Count,
// Check that `scan_document` with `ScanDocumentOpts::new()` reads the whole input.
VerifyStrictXml,
// Check that `scan_document` with `ScanDocumentOpts::relaxed()` reads the whole input.
VerifyRelaxed,
// Check that `scan_document` with `ScanDocumentOpts::assume_valid_xml()` reads the whole input.
VerifyAssumeXml,
}

/// Returns `true` when `input` begins with the UTF-8 byte order mark
/// (the three bytes `EF BB BF`).
///
/// Only the first three bytes are inspected, so an input that consists of
/// nothing but the mark is also reported as starting with it. Inputs shorter
/// than three bytes never match.
#[inline]
#[must_use]
fn is_utf8_bom(input: &[u8]) -> bool {
    // The mark is exactly three bytes long; requiring a fourth byte would
    // wrongly reject an input that is only the BOM.
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    input.starts_with(&UTF8_BOM)
}

/// Reports whether `maybe_xml::scan_document` consumes the entire document.
///
/// When `is_utf8_bom` says the input starts with a byte order mark, the first
/// three bytes are dropped before scanning. The scan starts at offset `0` with
/// the given `opts`, and the result is `true` only if the returned position
/// equals the length of the (possibly shortened) document.
#[inline]
#[must_use]
fn complete_scan(input: &[u8], opts: ScanDocumentOpts) -> bool {
    // Skip the byte order mark so the scanner only sees the document itself.
    let document = match is_utf8_bom(input) {
        true => &input[3..],
        false => input,
    };

    let scanned_to = maybe_xml::scan_document(document, 0, opts);
    scanned_to == Some(document.len())
}

fn main() -> io::Result<()> {
let args = Args::parse();

let stdin = io::read_to_string(io::stdin())?;

let mut counters = TokenCounters::default();
match args.cmd {
Some(Cmd::Count) | None => {
let mut counters = TokenCounters::default();

let reader = Reader::from_str(&stdin);
let reader = Reader::from_str(&stdin);

let mut pos = 0;
let mut pos = 0;

while let Some(token) = reader.tokenize(&mut pos) {
match token.ty() {
token::Ty::StartTag(_) => {
counters.start_tag += 1;
}
token::Ty::EmptyElementTag(_) => {
counters.empty_element_tag += 1;
}
token::Ty::EndTag(_) => {
counters.end_tag += 1;
}
token::Ty::Characters(_) => {
counters.chars += 1;
while let Some(token) = reader.tokenize(&mut pos) {
match token.ty() {
token::Ty::StartTag(_) => {
counters.start_tag += 1;
}
token::Ty::EmptyElementTag(_) => {
counters.empty_element_tag += 1;
}
token::Ty::EndTag(_) => {
counters.end_tag += 1;
}
token::Ty::Characters(_) => {
counters.chars += 1;
}
token::Ty::ProcessingInstruction(_) => {
counters.pi += 1;
}
token::Ty::Declaration(_) => {
counters.decl += 1;
}
token::Ty::Comment(_) => {
counters.comment += 1;
}
token::Ty::Cdata(_) => {
counters.cdata += 1;
}
}
}
token::Ty::ProcessingInstruction(_) => {
counters.pi += 1;
}
token::Ty::Declaration(_) => {
counters.decl += 1;
}
token::Ty::Comment(_) => {
counters.comment += 1;
}
token::Ty::Cdata(_) => {
counters.cdata += 1;
}
}
}

if pos != stdin.len() {
let error = format!("should have read the entire stdin but only read to {pos} bytes");
return Err(io::Error::new(io::ErrorKind::Other, error));
}

#[cfg(feature = "internal_unstable")]
{
use maybe_xml::ScanDocumentOpts;
if pos != stdin.len() {
let error =
format!("should have read the entire stdin but only read to {pos} bytes");
return Err(io::Error::new(io::ErrorKind::Other, error));
}

if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::new())
!= Some(stdin.len())
{
let error = "scan_document with DEFAULT options should have read the entire stdin";
return Err(io::Error::new(io::ErrorKind::Other, error));
println!("Start Tag: {}", counters.start_tag);
println!("Empty Element Tag: {}", counters.empty_element_tag);
println!("End Tag: {}", counters.end_tag);
println!("Characters: {}", counters.chars);
println!("Processing Instruction: {}", counters.pi);
println!("Declaration: {}", counters.decl);
println!("Comment: {}", counters.comment);
println!("Cdata: {}", counters.cdata);
}

if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::relaxed())
!= Some(stdin.len())
{
let error = "scan_document with RELAXED options should have read the entire stdin";
return Err(io::Error::new(io::ErrorKind::Other, error));
Some(Cmd::VerifyStrictXml) => {
if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::new()) {
let error = "scan_document with DEFAULT options should have read the entire stdin";
return Err(io::Error::new(io::ErrorKind::Other, error));
}
}

if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::assume_valid_xml())
!= Some(stdin.len())
{
let error =
"scan_document with ASSUME VALID XML options should have read the entire stdin";
return Err(io::Error::new(io::ErrorKind::Other, error));
Some(Cmd::VerifyRelaxed) => {
if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::relaxed()) {
let error = "scan_document with RELAXED options should have read the entire stdin";
return Err(io::Error::new(io::ErrorKind::Other, error));
}
}
Some(Cmd::VerifyAssumeXml) => {
if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::assume_valid_xml()) {
let error =
"scan_document with ASSUME VALID XML options should have read the entire stdin";
return Err(io::Error::new(io::ErrorKind::Other, error));
}
}
}

println!("Start Tag: {}", counters.start_tag);
println!("Empty Element Tag: {}", counters.empty_element_tag);
println!("End Tag: {}", counters.end_tag);
println!("Characters: {}", counters.chars);
println!("Processing Instruction: {}", counters.pi);
println!("Declaration: {}", counters.decl);
println!("Comment: {}", counters.comment);
println!("Cdata: {}", counters.cdata);
println!("OK");

Ok(())
}
15 changes: 10 additions & 5 deletions examples/tokencount/testing.nu
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,22 @@
#
# Example:
#
# maybe_xml_find_docs "DOCTYPE" 50 | maybe_xml_tc
# maybe_xml_find_docs "DOCTYPE" 50 | maybe_xml_tc | select url tc.exit_code strict.exit_code relaxed.exit_code assume.exit_code | where strict_exit_code != 0 or relaxed_exit_code != 0 or assume_exit_code != 0

def maybe_xml_tc [] {
each { |it|
let tc = do { http get $it --raw | tokencount } | complete
let text = http get $it --raw
let tc = do { $text | tokencount count } | complete
let strict = do { $text | tokencount verify-strict-xml } | complete
let relaxed = do { $text | tokencount verify-relaxed } | complete
let assume = do { $text | tokencount verify-assume-xml } | complete

{
url: $it,
tc_exit_code: $tc.exit_code,
tc_stderr: $tc.stderr,
tc_stdout: $tc.stdout,
tc: $tc
strict: $strict
relaxed: $relaxed
assume: $assume
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions maybe_xml/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ const fn is_utf8_boundary(byte: u8) -> bool {
byte as i8 >= -0x40
}

#[cfg(feature = "internal_unstable")]
#[cfg(any(test, feature = "internal_unstable"))]
pub use read::parser::scan_document;

#[cfg(feature = "internal_unstable")]
#[cfg(any(test, feature = "internal_unstable"))]
pub use read::parser::ScanDocumentOpts;

0 comments on commit 1ce384b

Please sign in to comment.