Add options to tokencount example

bluk · Dec 16, 2023 · 1ce384b · 1ce384b
1 parent 6d68a74
commit 1ce384b
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 78 deletions.
diff --git a/examples/tokencount/Cargo.toml b/examples/tokencount/Cargo.toml
@@ -5,11 +5,6 @@ name = "tokencount"
 publish = false
 version = "0.0.0"
 
-[features]
-default = ["internal_unstable"]
-
-# Exposing internal and unstable APIs
-internal_unstable = ["maybe_xml/internal_unstable"]
-
 [dependencies]
-maybe_xml = { path = "../../maybe_xml" }
+clap = { version = "4.4.11", features = ["derive"] }
+maybe_xml = { path = "../../maybe_xml", features = ["internal_unstable"] }
diff --git a/examples/tokencount/src/main.rs b/examples/tokencount/src/main.rs
@@ -1,5 +1,8 @@
 use std::io;
 
+use clap::{Parser, Subcommand};
+use maybe_xml::ScanDocumentOpts;
+
 use maybe_xml::{token, Reader};
 
 #[derive(Debug, Default)]
@@ -14,84 +17,121 @@ struct TokenCounters {
     cdata: usize,
 }
 
+#[derive(Debug, Parser)] // command-line arguments; parsed via `Args::parse()` in `main`
+struct Args {
+    #[command(subcommand)]
+    cmd: Option<Cmd>, // `None` (no subcommand given) is handled like `Cmd::Count` in `main`
+}
+
+#[derive(Debug, Subcommand)] // the modes `main` dispatches on
+enum Cmd {
+    Count, // `count`: tokenize stdin and print the per-token-type counters
+    VerifyStrictXml, // `verify-strict-xml`: scan with `ScanDocumentOpts::new()`
+    VerifyRelaxed, // `verify-relaxed`: scan with `ScanDocumentOpts::relaxed()`
+    VerifyAssumeXml, // `verify-assume-xml`: scan with `ScanDocumentOpts::assume_valid_xml()`
+}
+
+/// Returns `true` when `input` begins with the UTF-8 byte order mark.
+///
+/// The mark is exactly three bytes (`EF BB BF`), so a three-byte input that
+/// is only the mark matches; shorter inputs never match.
+#[inline]
+#[must_use]
+fn is_utf8_bom(input: &[u8]) -> bool {
+    input.starts_with(&[0xEF, 0xBB, 0xBF])
+}
+
+/// Returns `true` when `maybe_xml::scan_document`, started at offset 0 with
+/// `opts`, returns `Some` of the full length of `input` after a leading UTF-8
+/// byte order mark (if any) has been skipped.
+#[inline]
+#[must_use]
+fn complete_scan(input: &[u8], opts: ScanDocumentOpts) -> bool {
+    // The three BOM bytes are dropped before the bytes reach the scanner.
+    let body = if is_utf8_bom(input) { &input[3..] } else { input };
+    let scanned = maybe_xml::scan_document(body, 0, opts);
+    scanned == Some(body.len())
+}
+
 fn main() -> io::Result<()> {
+    let args = Args::parse();
+
     let stdin = io::read_to_string(io::stdin())?;
 
-    let mut counters = TokenCounters::default();
+    match args.cmd {
+        Some(Cmd::Count) | None => {
+            let mut counters = TokenCounters::default();
 
-    let reader = Reader::from_str(&stdin);
+            let reader = Reader::from_str(&stdin);
 
-    let mut pos = 0;
+            let mut pos = 0;
 
-    while let Some(token) = reader.tokenize(&mut pos) {
-        match token.ty() {
-            token::Ty::StartTag(_) => {
-                counters.start_tag += 1;
-            }
-            token::Ty::EmptyElementTag(_) => {
-                counters.empty_element_tag += 1;
-            }
-            token::Ty::EndTag(_) => {
-                counters.end_tag += 1;
-            }
-            token::Ty::Characters(_) => {
-                counters.chars += 1;
+            while let Some(token) = reader.tokenize(&mut pos) {
+                match token.ty() {
+                    token::Ty::StartTag(_) => {
+                        counters.start_tag += 1;
+                    }
+                    token::Ty::EmptyElementTag(_) => {
+                        counters.empty_element_tag += 1;
+                    }
+                    token::Ty::EndTag(_) => {
+                        counters.end_tag += 1;
+                    }
+                    token::Ty::Characters(_) => {
+                        counters.chars += 1;
+                    }
+                    token::Ty::ProcessingInstruction(_) => {
+                        counters.pi += 1;
+                    }
+                    token::Ty::Declaration(_) => {
+                        counters.decl += 1;
+                    }
+                    token::Ty::Comment(_) => {
+                        counters.comment += 1;
+                    }
+                    token::Ty::Cdata(_) => {
+                        counters.cdata += 1;
+                    }
+                }
             }
-            token::Ty::ProcessingInstruction(_) => {
-                counters.pi += 1;
-            }
-            token::Ty::Declaration(_) => {
-                counters.decl += 1;
-            }
-            token::Ty::Comment(_) => {
-                counters.comment += 1;
-            }
-            token::Ty::Cdata(_) => {
-                counters.cdata += 1;
-            }
-        }
-    }
-
-    if pos != stdin.len() {
-        let error = format!("should have read the entire stdin but only read to {pos} bytes");
-        return Err(io::Error::new(io::ErrorKind::Other, error));
-    }
 
-    #[cfg(feature = "internal_unstable")]
-    {
-        use maybe_xml::ScanDocumentOpts;
+            if pos != stdin.len() {
+                let error =
+                    format!("should have read the entire stdin but only read to {pos} bytes");
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
 
-        if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::new())
-            != Some(stdin.len())
-        {
-            let error = "scan_document with DEFAULT options should have read the entire stdin";
-            return Err(io::Error::new(io::ErrorKind::Other, error));
+            println!("Start Tag: {}", counters.start_tag);
+            println!("Empty Element Tag: {}", counters.empty_element_tag);
+            println!("End Tag: {}", counters.end_tag);
+            println!("Characters: {}", counters.chars);
+            println!("Processing Instruction: {}", counters.pi);
+            println!("Declaration: {}", counters.decl);
+            println!("Comment: {}", counters.comment);
+            println!("Cdata: {}", counters.cdata);
         }
-
-        if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::relaxed())
-            != Some(stdin.len())
-        {
-            let error = "scan_document with RELAXED options should have read the entire stdin";
-            return Err(io::Error::new(io::ErrorKind::Other, error));
+        Some(Cmd::VerifyStrictXml) => {
+            if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::new()) {
+                let error = "scan_document with DEFAULT options should have read the entire stdin";
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
         }
-
-        if maybe_xml::scan_document(stdin.as_bytes(), 0, ScanDocumentOpts::assume_valid_xml())
-            != Some(stdin.len())
-        {
-            let error =
-                "scan_document with ASSUME VALID XML options should have read the entire stdin";
-            return Err(io::Error::new(io::ErrorKind::Other, error));
+        Some(Cmd::VerifyRelaxed) => {
+            if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::relaxed()) {
+                let error = "scan_document with RELAXED options should have read the entire stdin";
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
+        }
+        Some(Cmd::VerifyAssumeXml) => {
+            if !complete_scan(stdin.as_bytes(), ScanDocumentOpts::assume_valid_xml()) {
+                let error =
+                    "scan_document with ASSUME VALID XML options should have read the entire stdin";
+                return Err(io::Error::new(io::ErrorKind::Other, error));
+            }
         }
     }
 
-    println!("Start Tag: {}", counters.start_tag);
-    println!("Empty Element Tag: {}", counters.empty_element_tag);
-    println!("End Tag: {}", counters.end_tag);
-    println!("Characters: {}", counters.chars);
-    println!("Processing Instruction: {}", counters.pi);
-    println!("Declaration: {}", counters.decl);
-    println!("Comment: {}", counters.comment);
-    println!("Cdata: {}", counters.cdata);
+    println!("OK");
 
     Ok(())
 }
diff --git a/examples/tokencount/testing.nu b/examples/tokencount/testing.nu
@@ -9,17 +9,22 @@
 #
 # Example:
 #
-# maybe_xml_find_docs "DOCTYPE" 50 | maybe_xml_tc
+# maybe_xml_find_docs "DOCTYPE" 50 | maybe_xml_tc | select url tc.exit_code strict.exit_code relaxed.exit_code assume.exit_code | where strict_exit_code != 0 or relaxed_exit_code != 0 or assume_exit_code != 0
 
 def maybe_xml_tc [] {
   each { |it|
-    let tc = do { http get $it --raw | tokencount } | complete
+    let text = http get $it --raw
+    let tc = do { $text | tokencount count } | complete
+    let strict = do { $text | tokencount verify-strict-xml } | complete
+    let relaxed = do { $text | tokencount verify-relaxed } | complete
+    let assume = do { $text |  tokencount verify-assume-xml } | complete
 
     {
       url: $it,
-      tc_exit_code: $tc.exit_code,
-      tc_stderr: $tc.stderr,
-      tc_stdout: $tc.stdout,
+      tc: $tc
+      strict: $strict
+      relaxed: $relaxed
+      assume: $assume
     }
   }
 }

diff --git a/maybe_xml/src/lib.rs b/maybe_xml/src/lib.rs
@@ -122,8 +122,8 @@ const fn is_utf8_boundary(byte: u8) -> bool {
     byte as i8 >= -0x40
 }
 
-#[cfg(feature = "internal_unstable")]
+#[cfg(any(test, feature = "internal_unstable"))]
 pub use read::parser::scan_document;
 
-#[cfg(feature = "internal_unstable")]
+#[cfg(any(test, feature = "internal_unstable"))]
 pub use read::parser::ScanDocumentOpts;