Skip to content

Commit

Permalink
Restore panic if pos is greater than input
Browse files Browse the repository at this point in the history
- Restore the previous panicking behavior for errant programs that pass a
  position greater than the input length
- Use safe UTF-8 conversion so that a `&str` is never constructed when the
  pos does not fall on a valid UTF-8 boundary
  • Loading branch information
bluk committed Nov 13, 2023
1 parent 59821e4 commit 82f5767
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 20 deletions.
43 changes: 24 additions & 19 deletions maybe_xml/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ impl<'a> Lexer<'a> {
///
/// If a token is found, the position is also updated to after the token.
///
/// # Panics
///
/// Panics if the `pos` is greater than the input length.
///
/// # Examples
///
/// ```
Expand All @@ -158,23 +162,20 @@ impl<'a> Lexer<'a> {
///```
#[must_use]
pub fn tokenize(&self, pos: &mut usize) -> Option<Token<'a>> {
    // `scan` returns the exclusive end index of the next token, or `None`
    // when no complete token starts at `pos`. When `pos` is greater than
    // the input length it panics via slice indexing, which is the
    // documented `# Panics` contract restored by this change.
    let end = scan(self.input, *pos)?;
    // Safe conversion: if `pos`/`end` do not fall on valid UTF-8
    // boundaries, bail out with `None` instead of fabricating an invalid
    // `&str` (the previous `from_utf8_unchecked` was unsound here).
    // Note `pos` is intentionally left unchanged on this early return.
    let token = Token::from_str(core::str::from_utf8(&self.input[*pos..end]).ok()?);
    // Advance the caller's cursor past the token just produced.
    *pos = end;
    Some(token)
}

/// Constant function which tokenizes the input starting at the given position.
///
/// If a token is found, the position is also updated to after the token.
///
/// # Panics
///
/// Panics if the `pos` is greater than the input length.
///
/// # Examples
///
/// ```
Expand All @@ -198,8 +199,12 @@ impl<'a> Lexer<'a> {
// This is a convoluted but *const* way of getting &self.input[*pos..end]
let (bytes, _) = self.input.split_at(end);
let (_, bytes) = bytes.split_at(pos);
let token = Token::from_str(unsafe { core::str::from_utf8_unchecked(bytes) });
Some(token)
if let Ok(s) = core::str::from_utf8(bytes) {
let token = Token::from_str(s);
Some(token)
} else {
None
}
} else {
None
}
Expand Down Expand Up @@ -463,19 +468,19 @@ mod tests {
}

#[test]
#[should_panic(expected = "out of bounds")]
fn panic_on_pos_greater_than_slice_len() {
    // An empty input has length 0, so position 1 is past the end of the
    // slice; `tokenize` must panic (restored behavior for errant callers)
    // rather than quietly return `None` as the previous revision did.
    let lexer = Lexer::from_str("");
    let mut pos = 1;
    let _ = lexer.tokenize(&mut pos);
}

#[test]
#[should_panic(expected = "out of bounds")]
fn panic_on_pos_greater_than_slice_len_2() {
    // Non-empty input variant: one past the end of "hello" (len 5) must
    // panic with an out-of-bounds slice error, not return `None`.
    let lexer = Lexer::from_str("hello");
    let mut pos = "hello".len() + 1;
    let _ = lexer.tokenize(&mut pos);
}

#[cfg(any(feature = "std", feature = "alloc"))]
Expand Down
2 changes: 1 addition & 1 deletion maybe_xml/src/lexer/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ const fn scan_cdata(input: &[u8]) -> Option<usize> {
#[inline]
#[must_use]
pub(super) const fn scan(input: &[u8], pos: usize) -> Option<usize> {
if input.len() <= pos {
if input.len() == pos {
return None;
}

Expand Down

0 comments on commit 82f5767

Please sign in to comment.