diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 94f4aaecfc61b3..938850de48951e 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -29,15 +29,43 @@ piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: -# 1) if you change tagfind/attrfind remember to update locatestarttagend too; -# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will +# 1) if you change tagfind/attrfind remember to update locatetagend too; +# 2) if you change tagfind/attrfind and/or locatetagend the parser will # explode, so don't do it. -# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state -# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state -tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') -attrfind_tolerant = re.compile( - r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') +# see the HTML5 specs section "13.2.5.6 Tag open state", +# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state". +# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state +# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state +# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state +tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*') +attrfind_tolerant = re.compile(r""" + ( + (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name + ) + (= # value indicator + ('[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\t\n\r\f ]* # bare value + ) + )? + (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space +""", re.VERBOSE) +locatetagend = re.compile(r""" + [a-zA-Z][^\t\n\r\f />]* # tag name + [\t\n\r\f /]* # optional whitespace before attribute name + (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name + (?:= # value indicator + (?:'[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\t\n\r\f ]* # bare value + ) + )? + [\t\n\r\f /]* # possibly followed by a space + )* + >? +""", re.VERBOSE) +# The following variables are not used, but are temporarily left for +# backward compatibility. locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name @@ -54,8 +82,6 @@ \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') -# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between -# ') @@ -122,7 +148,8 @@ def get_starttag_text(self): def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) + self.interesting = re.compile(r'])' % self.cdata_elem, + re.IGNORECASE|re.ASCII) def clear_cdata_mode(self): self.interesting = interesting_normal @@ -147,7 +174,7 @@ def goahead(self, end): # & near the end and see if it's followed by a space or ;. amppos = rawdata.rfind('&', max(i, n-34)) if (amppos >= 0 and - not re.compile(r'[\s;]').search(rawdata, amppos)): + not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)): break # wait till we get all the text j = n else: @@ -260,7 +287,7 @@ def goahead(self, end): else: assert 0, "interesting.search() lied" # end while - if end and i < n and not self.cdata_elem: + if end and i < n: if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:n])) else: @@ -291,7 +318,7 @@ def parse_html_declaration(self, i): return self.parse_bogus_comment(i) # Internal -- parse bogus comment, return length or -1 if not terminated - # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state + # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state def parse_bogus_comment(self, i, report=1): rawdata = self.rawdata assert rawdata[i:i+2] in ('": - return j + 1 - if next == "/": - if rawdata.startswith("/>", j): - return j + 2 - if rawdata.startswith("/", j): - # buffer boundary - return -1 - # else bogus input - if j > i: - return j - else: - return i + 1 - if next == "": - # end of input - return -1 - if next in ("abcdefghijklmnopqrstuvwxyz=/" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): - # end of input in or before attribute value, or we have the - # '/' from a '/>' ending - return -1 - if j > i: - return j - else: - return i + 1 - raise AssertionError("we should not get here!") + match = locatetagend.match(rawdata, i+1) + assert match + j = match.end() + if rawdata[j-1] != ">": + return -1 + return j # Internal -- parse endtag, return end or -1 if incomplete def parse_endtag(self, i): + # See the HTML5 specs section "13.2.5.7 End tag open state" + # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state rawdata = self.rawdata assert rawdata[i:i+2] == " - if not match: + if rawdata.find('>', i+2) < 0: # fast check return -1 - gtpos = match.end() - match = endtagfind.match(rawdata, i) # - if not match: - if self.cdata_elem is not None: - self.handle_data(rawdata[i:gtpos]) - return gtpos - # find the name: w3.org/TR/html5/tokenization.html#tag-name-state - namematch = tagfind_tolerant.match(rawdata, i+2) - if not namematch: - # w3.org/TR/html5/tokenization.html#end-tag-open-state - if rawdata[i:i+3] == '': - return i+3 - else: - return self.parse_bogus_comment(i) - tagname = namematch.group(1).lower() - # consume and ignore other stuff between the name and the > - # Note: this is not 100% correct, since we might have things like - # , but looking for > after the name should cover - # most of the cases and is much simpler - gtpos = rawdata.find('>', namematch.end()) - self.handle_endtag(tagname) - return gtpos+1 + if not endtagopen.match(rawdata, i): # ': # is ignored + # "missing-end-tag-name" parser error + return i+3 + else: + return self.parse_bogus_comment(i) - elem = match.group(1).lower() # script or style - if self.cdata_elem is not None: - if elem != self.cdata_elem: - self.handle_data(rawdata[i:gtpos]) - return gtpos + match = locatetagend.match(rawdata, i+2) + assert match + j = match.end() + if rawdata[j-1] != ">": + return -1 - self.handle_endtag(elem) + # find the name: "13.2.5.8 Tag name state" + # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state + match = tagfind_tolerant.match(rawdata, i+2) + assert match + tag = match.group(1).lower() + self.handle_endtag(tag) self.clear_cdata_mode() - return gtpos + return j # Overridable -- finish processing of start+end tag: def handle_startendtag(self, tag, attrs): diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index b7cf1e28581ed9..8e9b2c308e5e16 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -718,6 +718,31 @@ def check_sizeof(test, o, size): % (type(o), result, size) test.assertEqual(result, size, msg) +def subTests(arg_names, arg_values, /, *, _do_cleanups=False): + """Run multiple subtests with different parameters. + """ + single_param = False + if isinstance(arg_names, str): + arg_names = arg_names.replace(',',' ').split() + if len(arg_names) == 1: + single_param = True + arg_values = tuple(arg_values) + def decorator(func): + if isinstance(func, type): + raise TypeError('subTests() can only decorate methods, not classes') + @functools.wraps(func) + def wrapper(self, /, *args, **kwargs): + for values in arg_values: + if single_param: + values = (values,) + subtest_kwargs = dict(zip(arg_names, values)) + with self.subTest(**subtest_kwargs): + func(self, *args, **kwargs, **subtest_kwargs) + if _do_cleanups: + self.doCleanups() + return wrapper + return decorator + #======================================================================= # Decorator for running a function in a different locale, correctly resetting # it afterwards. diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index df775c11310146..ea3d8bbc005825 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -80,6 +80,13 @@ def handle_entityref(self, data): self.fail('This should never be called with convert_charrefs=True') +# The normal event collector normalizes the events in get_events, +# so we override it to return the original list of events. +class EventCollectorNoNormalize(EventCollector): + def get_events(self): + return self.events + + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -264,8 +271,7 @@ def test_get_starttag_text(self): ("starttag", "foo:bar", [("one", "1"), ("two", "2")]), ("starttag_text", s)]) - def test_cdata_content(self): - contents = [ + @support.subTests('content', [ ' ¬-an-entity-ref;', "", '

', @@ -278,44 +284,83 @@ def test_cdata_content(self): 'src="http://www.example.org/r=\'+new ' 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'), '\n\n', - 'foo = "";', '', - # these two should be invalid according to the HTML 5 spec, - # section 8.1.2.2 - #'foo = ', - #'foo = ', - ] - elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] - for content in contents: - for element in elements: - element_lower = element.lower() - s = '<{element}>{content}'.format(element=element, - content=content) - self._run_check(s, [("starttag", element_lower, []), - ("data", content), - ("endtag", element_lower)]) - - def test_cdata_with_closing_tags(self): + 'foo = ""', + 'foo = ""', + 'foo = ""', + 'foo = ""', + 'foo = ""', + 'foo = ""', + ]) + def test_script_content(self, content): + s = f'' + self._run_check(s, [("starttag", "script", []), + ("data", content), + ("endtag", "script")]) + + @support.subTests('content', [ + 'a::before { content: ""; }', + 'a::before { content: "¬-an-entity-ref;"; }', + 'a::before { content: ""; }', + 'a::before { content: "\u2603"; }', + 'a::before { content: "< /style>"; }', + 'a::before { content: ""; }', + 'a::before { content: ""; }', + 'a::before { content: ""; }', + 'a::before { content: ""; }', + 'a::before { content: ""; }', + ]) + def test_style_content(self, content): + s = f'' + self._run_check(s, [("starttag", "style", []), + ("data", content), + ("endtag", "style")]) + + @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', + 'script/', 'script foo=bar', 'script foo=">"']) + def test_script_closing_tag(self, endtag): # see issue #13358 # make sure that HTMLParser calls handle_data only once for each CDATA. - # The normal event collector normalizes the events in get_events, - # so we override it to return the original list of events. - class Collector(EventCollector): - def get_events(self): - return self.events - content = """ ¬-an-entity-ref;

''""" - for element in [' script', 'script ', ' script ', - '\nscript', 'script\n', '\nscript\n']: - element_lower = element.lower().strip() - s = '{content}{tail}' + self._run_check(s, [("starttag", "script", []), + ("data", content if end else content + tail)], + collector=EventCollectorNoNormalize(convert_charrefs=False)) def test_comments(self): html = ("" @@ -405,7 +450,7 @@ def test_starttag_junk_chars(self): self._run_check("", [('comment', '$')]) self._run_check("", [('endtag', 'a')]) + self._run_check("", [('comment', ' a')]) self._run_check("", [('starttag', 'a", [('endtag', 'a', [('endtag', 'a')]) + self._run_check('', [('endtag', 'a')]) + def test_declaration_junk_chars(self): self._run_check("", [('decl', 'DOCTYPE foo $ ')]) @@ -487,15 +536,11 @@ def test_invalid_end_tags(self): self._run_check(html, expected) def test_broken_invalid_end_tag(self): - # This is technically wrong (the "> shouldn't be included in the 'data') - # but is probably not worth fixing it (in addition to all the cases of - # the previous test, it would require a full attribute parsing). - # see #13993 html = 'This confuses the parser' expected = [('starttag', 'b', []), ('data', 'This'), ('endtag', 'b'), - ('data', '"> confuses the parser')] + ('data', ' confuses the parser')] self._run_check(html, expected) def test_correct_detection_of_start_tags(self): @@ -522,7 +567,7 @@ def test_correct_detection_of_start_tags(self): html = '

The rain' expected = [ - ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]), + ('starttag', 'div', [('style', ''), (',', None), ('foo', None), ('=', None), ('"bar"', None)]), ('starttag', 'b', []), ('data', 'The '), ('starttag', 'a', [('href', 'some_url')]), @@ -677,9 +722,15 @@ def test_attr_syntax(self): ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) ] self._run_check("""""", output) - self._run_check("""""", output) - self._run_check("""""", output) - self._run_check("""""", output) + self._run_check("", [('starttag', 'a', [('foo', '=bar')])]) + self._run_check("", [('starttag', 'a', [('foo', None), ('=bar', None)])]) + self._run_check("", [('starttag', 'a', [('foo', None), ('=bar', None)])]) + self._run_check("", [('starttag', 'a', [('foo\v', 'bar')])]) + self._run_check("", [('starttag', 'a', [('foo\xa0', 'bar')])]) + self._run_check("", [('starttag', 'a', [('foo', ''), ('bar', None)])]) + self._run_check("", [('starttag', 'a', [('foo', ''), ('bar', None)])]) + self._run_check("", [('starttag', 'a', [('foo', '\vbar')])]) + self._run_check("", [('starttag', 'a', [('foo', '\xa0bar')])]) def test_attr_values(self): self._run_check("""""", @@ -688,6 +739,10 @@ def test_attr_values(self): ("d", "\txyz\n")])]) self._run_check("""""", [("starttag", "a", [("b", ""), ("c", "")])]) + self._run_check("", + [("starttag", "a", [("b", ""), ("c", "")])]) + self._run_check("", + [("starttag", "a", [("b", "\v"), ("c", "\xa0")])]) # Regression test for SF patch #669683. self._run_check("", [("starttag", "e", [("a", "rgb(1,2,3)")])]) @@ -759,7 +814,7 @@ def test_malformed_attributes(self): ('data', 'test - bad2'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), ('data', 'test - bad3'), ('endtag', 'a'), - ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), + ('starttag', 'a', [('href', None), ('=', None), ("test' style", 'color:red;bad4')]), ('data', 'test - bad4'), ('endtag', 'a') ] self._run_check(html, expected) diff --git a/Misc/NEWS.d/next/Security/2023-02-13-21-41-34.gh-issue-86155.ppIGSC.rst b/Misc/NEWS.d/next/Security/2023-02-13-21-41-34.gh-issue-86155.ppIGSC.rst new file mode 100644 index 00000000000000..bb85481b229697 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2023-02-13-21-41-34.gh-issue-86155.ppIGSC.rst @@ -0,0 +1,2 @@ +:meth:`html.parser.HTMLParser.close` no longer loses data when the +````. + +* Multiple slashes and whitespaces between the last attribute and closing ``>`` + are now ignored in both start and end tags. E.g. ````. + +* Multiple ``=`` between attribute name and value are no longer collapsed. + E.g. ```` produces attribute "foo" with value "=bar". + +* Whitespaces between the ``=`` separator and attribute name or value are no + longer ignored. E.g. ```` produces two attributes "foo" and + "=bar", both with value None; ```` produces two attributes: + "foo" with value "" and "bar" with value None.