From daff30c4929d086823e11c9bd228febb9cfd3737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arb=C3=ABr=20Shabhasa?= Date: Thu, 4 Jul 2024 06:57:43 +0200 Subject: [PATCH] Handle closing tag with whitespace (#128) --- lib/saxy/parser/builder.ex | 29 +++++++++++++++++------------ test/saxy_test.exs | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/lib/saxy/parser/builder.ex b/lib/saxy/parser/builder.ex index 2e7d7ad..b11ad28 100644 --- a/lib/saxy/parser/builder.ex +++ b/lib/saxy/parser/builder.ex @@ -851,7 +851,7 @@ defmodule Saxy.Parser.Builder do open_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint)) "/" <> rest -> - close_tag_name(rest, more?, original, pos + 1, state, 0) + close_tag_name(rest, more?, original, pos + 1, state, 0, 0) "![CDATA[" <> rest -> element_cdata(rest, more?, original, pos + 8, state, 0) @@ -1161,30 +1161,31 @@ defmodule Saxy.Parser.Builder do end end - defp close_tag_name(<>, more?, original, pos, state, 0) do + defp close_tag_name(<>, more?, original, pos, state, 0, 0) do lookahead buffer, @streaming do char <> rest when is_ascii_name_start_char(char) -> - close_tag_name(rest, more?, original, pos, state, 1) + close_tag_name(rest, more?, original, pos, state, 1, 1) token in unquote(utf8_binaries()) when more? -> - halt!(close_tag_name(token, more?, original, pos, state, 0)) + halt!(close_tag_name(token, more?, original, pos, state, 0, 0)) <> <> rest when is_utf8_name_start_char(codepoint) -> - close_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint)) + len = Utils.compute_char_len(codepoint) + close_tag_name(rest, more?, original, pos, state, len, len) _ in [""] when more? -> - halt!(close_tag_name("", more?, original, pos, state, 0)) + halt!(close_tag_name("", more?, original, pos, state, 0, 0)) _ -> Utils.parse_error(original, pos, state, {:token, :end_tag}) end end - defp close_tag_name(<>, more?, original, pos, state, len) do + defp close_tag_name(<>, more?, original, pos, state, len, copy_to) do lookahead buffer, @streaming do ">" <> rest -> [open_tag | stack] = state.stack - ending_tag = binary_part(original, pos, len) + ending_tag = binary_part(original, pos, copy_to) pos = pos + len + 1 if open_tag == ending_tag do @@ -1205,16 +1206,20 @@ defmodule Saxy.Parser.Builder do end char <> rest when is_ascii_name_char(char) -> - close_tag_name(rest, more?, original, pos, state, len + 1) + close_tag_name(rest, more?, original, pos, state, len + 1, copy_to + 1) + + char <> rest when is_whitespace(char) -> + close_tag_name(rest, more?, original, pos, state, len + 1, copy_to) token in unquote(utf8_binaries()) when more? -> - halt!(close_tag_name(token, more?, original, pos, state, len)) + halt!(close_tag_name(token, more?, original, pos, state, len, copy_to)) <> <> rest when is_utf8_name_char(codepoint) -> - close_tag_name(rest, more?, original, pos, state, len + Utils.compute_char_len(codepoint)) + char_len = Utils.compute_char_len(codepoint) + close_tag_name(rest, more?, original, pos, state, len + char_len, copy_to + char_len) _ in [""] when more? -> - halt!(close_tag_name("", more?, original, pos, state, len)) + halt!(close_tag_name("", more?, original, pos, state, len, copy_to)) _ -> Utils.parse_error(original, pos + len, state, {:token, :end_tag}) diff --git a/test/saxy_test.exs b/test/saxy_test.exs index 4242b75..981b662 100644 --- a/test/saxy_test.exs +++ b/test/saxy_test.exs @@ -73,6 +73,20 @@ defmodule SaxyTest do ] end + test "parse_string/4 parses XML binary with closing tags containing whitespaces" do + data = "Some data" + + assert {:ok, state} = parse(data, StackHandler, []) + + assert state == [ + end_document: {}, + end_element: "foo", + characters: "Some data", + start_element: {"foo", []}, + start_document: [] + ] + end + test "handles trailing Unicode codepoints during streaming" do data = "𠜎𠜱𠝹𠱓" stream = for byte <- :binary.bin_to_list(data), do: <> @@ -134,6 +148,10 @@ defmodule SaxyTest do data = "" assert {:error, exception} = parse(data, StackHandler, []) assert Exception.message(exception) == "unexpected ending tag \"bee\", expected tag: \"bar\"" + + data = "Some data" + assert {:error, exception} = parse(data, StackHandler, []) + assert Exception.message(exception) == "unexpected ending tag \"foo \", expected tag: \"foo\"" end describe "encode!/2" do