From 7ee8853f3148cb61f7b3faec5c43275f36fb5659 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@users.noreply.github.com>
Date: Thu, 16 Jan 2025 14:55:39 +0200
Subject: [PATCH] xml detect enhancement (#659)

* xml detect enhancement

* Find matched closing tag

* xml spec case
---
 credsweeper/deep_scanner/deep_scanner.py | 11 ++++----
 credsweeper/utils/util.py                | 33 +++++++++++++++--------
 tests/utils/test_util.py                 | 34 ++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 17 deletions(-)
diff --git a/credsweeper/deep_scanner/deep_scanner.py b/credsweeper/deep_scanner/deep_scanner.py
index 808ea76dd..dbc006d7c 100644
--- a/credsweeper/deep_scanner/deep_scanner.py
+++ b/credsweeper/deep_scanner/deep_scanner.py
@@ -102,16 +102,15 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
         elif file_type in [".eml", ".mht"]:
             if Util.is_eml(data):
                 deep_scanners.append(EmlScanner)
-            elif Util.is_html(data):
+            elif Util.is_xml(data) and Util.is_html(data):
                 deep_scanners.append(HtmlScanner)
             else:
                 deep_scanners.append(ByteScanner)
-        elif Util.is_html(data):
-            deep_scanners.append(HtmlScanner)
-            deep_scanners.append(XmlScanner)
-        elif Util.is_mxfile(data):
-            deep_scanners.append(MxfileScanner)
         elif Util.is_xml(data):
+            if Util.is_html(data):
+                deep_scanners.append(HtmlScanner)
+            elif Util.is_mxfile(data):
+                deep_scanners.append(MxfileScanner)
             deep_scanners.append(XmlScanner)
         else:
             deep_scanners = [EncoderScanner, LangScanner, ByteScanner]
diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py
index 4f9f49436..1e45fe3fc 100644
--- a/credsweeper/utils/util.py
+++ b/credsweeper/utils/util.py
@@ -459,29 +459,40 @@ def is_elf(data: Union[bytes, bytearray]) -> bool:
 
     @staticmethod
     def is_html(data: Union[bytes, bytearray]) -> bool:
-        """Used to detect html format of eml"""
+        """Used to detect html format. Suppose, invocation of is_xml() was True before."""
         if isinstance(data, (bytes, bytearray)):
-            if 0 <= data.find(b"<html", 0, CHUNK_SIZE) and b"</html>" in data:
-                return True
+            for opening_tag, closing_tag in [(b"<html>", b"</html>"), (b"<table", b"</table>"), (b"<p>", b"</p>"),
+                                             (b"<span>", b"</span>"), (b"<div>", b"</div>"), (b"<li>", b"</li>"),
+                                             (b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"), (b"<th>", b"</th>"),
+                                             (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
+                opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
+                if 0 <= opening_pos < data.find(closing_tag, opening_pos):
+                    # opening and closing tags were found - suppose it is an HTML
+                    return True
         return False
 
     @staticmethod
     def is_mxfile(data: Union[bytes, bytearray]) -> bool:
-        """Used to detect mxfile format"""
+        """Used to detect mxfile (drawio) format. Suppose, invocation of is_xml() was True before."""
         if isinstance(data, (bytes, bytearray)):
-            if 0 <= data.find(b"<mxfile", 0, CHUNK_SIZE) and b"</mxfile>" in data:
+            mxfile_tag_pos = data.find(b"<mxfile", 0, MAX_LINE_LENGTH)
+            if 0 <= mxfile_tag_pos < data.find(b"</mxfile>", mxfile_tag_pos):
                 return True
         return False
 
-    XML_CLOSE_TAG_PATTERN = re.compile(rb"</[0-9A-Za-z_]{1,80}>")
+    # A well-formed XML must start from < or a whitespace character
+    XML_FIRST_BRACKET_PATTERN = re.compile(rb"^\s*<")
+    XML_OPENING_TAG_PATTERN = re.compile(rb"<([0-9A-Za-z_]{1,256})")
 
     @staticmethod
     def is_xml(data: Union[bytes, bytearray]) -> bool:
-        """Used to detect xml format"""
-        if isinstance(data, (bytes, bytearray)):
-            start = data.find(b'<', 0, CHUNK_SIZE)
-            if 0 <= start and 0 <= data.find(b'>', start + 1, CHUNK_SIZE):
-                return bool(re.search(Util.XML_CLOSE_TAG_PATTERN, data))
+        """Used to detect xml format from raw bytes"""
+        if isinstance(data, (bytes, bytearray)) and Util.XML_FIRST_BRACKET_PATTERN.search(data, 0, MAX_LINE_LENGTH):
+            if first_bracket_match := Util.XML_OPENING_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH):
+                start_pos = first_bracket_match.start()
+                closing_tag = b"</" + first_bracket_match.group(1) + b">"
+                if start_pos < data.find(closing_tag, start_pos):
+                    return True
         return False
 
     @staticmethod
diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py
index f7dc4d5bb..a4af145f3 100644
--- a/tests/utils/test_util.py
+++ b/tests/utils/test_util.py
@@ -616,3 +616,37 @@ def test_subtext_p(self):
         self.assertEqual("the lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 6))
         self.assertEqual(AZ_STRING[:39], Util.subtext(AZ_STRING, 15, 20))
         self.assertEqual(AZ_STRING[-40:], Util.subtext(AZ_STRING, 33, 20))
+
+    def test_is_xml_n(self):
+        self.assertFalse(Util.is_xml(b''))
+        self.assertFalse(Util.is_xml(b"!<>"))
+        self.assertFalse(Util.is_xml(b"</onlyClosingTagIsFail>"))
+        self.assertFalse(Util.is_xml(b"</p><p>"))
+        self.assertFalse(Util.is_xml(b"<br />"))
+        self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b"    <xml>far far away</xml>")))
+        self.assertFalse(Util.is_xml(b"<html> unmatched tags </xml>"))
+        self.assertFalse(Util.is_xml(b"<?xml version='1.0' encoding='utf-8'?>"))
+
+    def test_is_html_n(self):
+        self.assertFalse(Util.is_html(b"</html><html>"))
+
+    def test_is_mxfile_n(self):
+        self.assertFalse(Util.is_mxfile(b"<mxfile>"))
+        self.assertFalse(Util.is_mxfile(b"</mxfile><mxfile>"))
+
+    def test_xml_n(self):  # non-bytes input must be rejected by every format detector
+        self.assertFalse(Util.is_xml(None))
+        self.assertFalse(Util.is_xml(''))
+        self.assertFalse(Util.is_html(None))
+        self.assertFalse(Util.is_mxfile(None))  # was a duplicate of the is_html(None) check
+
+    def test_xml_p(self):
+        self.assertTrue(Util.is_xml(b"<?xml version='1.0' encoding='utf-8'?><xml> matched tags </xml>"))
+        data = b"<mxfile atr=0><table></table></mxfile>"
+        self.assertTrue(Util.is_xml(data))
+        self.assertTrue(Util.is_html(data))
+        self.assertTrue(Util.is_mxfile(data))
+        self.assertTrue(
+            Util.is_xml(
+                bytearray(b'\n<xml> far far away ') + bytearray(b'\n' * MAX_LINE_LENGTH) +
+                bytearray(b' long long ago </xml>')))