From 7ee8853f3148cb61f7b3faec5c43275f36fb5659 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 16 Jan 2025 14:55:39 +0200 Subject: [PATCH] xml detect enchancement (#659) * xml detect enchancement * Find matched closing tag * xml spec case --- credsweeper/deep_scanner/deep_scanner.py | 11 ++++---- credsweeper/utils/util.py | 33 +++++++++++++++-------- tests/utils/test_util.py | 34 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/credsweeper/deep_scanner/deep_scanner.py b/credsweeper/deep_scanner/deep_scanner.py index 808ea76dd..dbc006d7c 100644 --- a/credsweeper/deep_scanner/deep_scanner.py +++ b/credsweeper/deep_scanner/deep_scanner.py @@ -102,16 +102,15 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]: elif file_type in [".eml", ".mht"]: if Util.is_eml(data): deep_scanners.append(EmlScanner) - elif Util.is_html(data): + elif Util.is_xml(data) and Util.is_html(data): deep_scanners.append(HtmlScanner) else: deep_scanners.append(ByteScanner) - elif Util.is_html(data): - deep_scanners.append(HtmlScanner) - deep_scanners.append(XmlScanner) - elif Util.is_mxfile(data): - deep_scanners.append(MxfileScanner) elif Util.is_xml(data): + if Util.is_html(data): + deep_scanners.append(HtmlScanner) + elif Util.is_mxfile(data): + deep_scanners.append(MxfileScanner) deep_scanners.append(XmlScanner) else: deep_scanners = [EncoderScanner, LangScanner, ByteScanner] diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py index 4f9f49436..1e45fe3fc 100644 --- a/credsweeper/utils/util.py +++ b/credsweeper/utils/util.py @@ -459,29 +459,40 @@ def is_elf(data: Union[bytes, bytearray]) -> bool: @staticmethod def is_html(data: Union[bytes, bytearray]) -> bool: - """Used to detect html format of eml""" + """Used to detect html format. Suppose, invocation of is_xml() was True before.""" if isinstance(data, (bytes, bytearray)): - if 0 <= data.find(b"" in data: - return True + for opening_tag, closing_tag in [(b"", b""), (b""), (b"

", b"

"), + (b"", b""), (b"
", b"
"), (b"
  • ", b"
  • "), + (b"
      ", b"
    "), (b"
      ", b"
    "), (b"", b""), + (b"", b""), (b"", b"")]: + opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH) + if 0 <= opening_pos < data.find(closing_tag, opening_pos): + # opening and closing tags were found - suppose it is an HTML + return True return False @staticmethod def is_mxfile(data: Union[bytes, bytearray]) -> bool: - """Used to detect mxfile format""" + """Used to detect mxfile (drawio) format. Suppose, invocation of is_xml() was True before.""" if isinstance(data, (bytes, bytearray)): - if 0 <= data.find(b"" in data: + mxfile_tag_pos = data.find(b"", mxfile_tag_pos): return True return False - XML_CLOSE_TAG_PATTERN = re.compile(rb"") + # A well-formed XML must start from < or a whitespace character + XML_FIRST_BRACKET_PATTERN = re.compile(rb"^\s*<") + XML_OPENING_TAG_PATTERN = re.compile(rb"<([0-9A-Za-z_]{1,256})") @staticmethod def is_xml(data: Union[bytes, bytearray]) -> bool: - """Used to detect xml format""" - if isinstance(data, (bytes, bytearray)): - start = data.find(b'<', 0, CHUNK_SIZE) - if 0 <= start and 0 <= data.find(b'>', start + 1, CHUNK_SIZE): - return bool(re.search(Util.XML_CLOSE_TAG_PATTERN, data)) + """Used to detect xml format from raw bytes""" + if isinstance(data, (bytes, bytearray)) and Util.XML_FIRST_BRACKET_PATTERN.search(data, 0, MAX_LINE_LENGTH): + if first_bracket_match := Util.XML_OPENING_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH): + start_pos = first_bracket_match.start() + closing_tag = b"" + if start_pos < data.find(closing_tag, start_pos): + return True return False @staticmethod diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py index f7dc4d5bb..a4af145f3 100644 --- a/tests/utils/test_util.py +++ b/tests/utils/test_util.py @@ -616,3 +616,37 @@ def test_subtext_p(self): self.assertEqual("the lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 6)) self.assertEqual(AZ_STRING[:39], Util.subtext(AZ_STRING, 15, 20)) self.assertEqual(AZ_STRING[-40:], Util.subtext(AZ_STRING, 33, 20)) + + def test_is_xml_n(self): + self.assertFalse(Util.is_xml(b'')) + self.assertFalse(Util.is_xml(b"!<>")) + self.assertFalse(Util.is_xml(b"")) + self.assertFalse(Util.is_xml(b"

    ")) + self.assertFalse(Util.is_xml(b"
    ")) + self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b" far far away"))) + self.assertFalse(Util.is_xml(b" unmatched tags ")) + self.assertFalse(Util.is_xml(b"")) + + def test_is_html_n(self): + self.assertFalse(Util.is_html(b"")) + + def test_is_mxfile_n(self): + self.assertFalse(Util.is_mxfile(b"")) + self.assertFalse(Util.is_mxfile(b"")) + + def test_xml_n(self): + self.assertFalse(Util.is_xml(None)) + self.assertFalse(Util.is_xml('')) + self.assertFalse(Util.is_html(None)) + self.assertFalse(Util.is_html(None)) + + def test_xml_p(self): + self.assertTrue(Util.is_xml(b" matched tags ")) + data = b"
    " + self.assertTrue(Util.is_xml(data)) + self.assertTrue(Util.is_html(data)) + self.assertTrue(Util.is_mxfile(data)) + self.assertTrue( + Util.is_xml( + bytearray(b'\n far far away ') + bytearray(b'\n' * MAX_LINE_LENGTH) + + bytearray(b' long long ago ')))