From dce77bfd4664f1ab90f76d3c847224dbbfabe820 Mon Sep 17 00:00:00 2001 From: Hissssst <37012324+hissssst@users.noreply.github.com> Date: Sun, 7 May 2023 18:50:26 +0000 Subject: [PATCH] Saxy.stream_events implemented (#118) --- lib/saxy.ex | 96 ++++++++++++++++++++++++++++-- lib/saxy/handler/accumulating.ex | 11 ++++ mix.lock | 10 ++-- test/saxy_test.exs | 12 ++++ test/support/fixture/incorrect.xml | 8 +++ 5 files changed, 128 insertions(+), 9 deletions(-) create mode 100644 lib/saxy/handler/accumulating.ex create mode 100644 test/support/fixture/incorrect.xml diff --git a/lib/saxy.ex b/lib/saxy.ex index 79e0af5..00dd989 100644 --- a/lib/saxy.ex +++ b/lib/saxy.ex @@ -111,8 +111,11 @@ defmodule Saxy do """ + @compile {:inline, do_transform_stream: 4} + alias Saxy.{ Encoder, + Handler.Accumulating, Parser, State } @@ -309,10 +312,95 @@ defmodule Saxy do end defp reduce_stream(buffer, {cont_fun, state}) do - with {:halted, cont_fun, state} <- cont_fun.(buffer, true, state) do - {:cont, {cont_fun, state}} - else - other -> {:halt, other} + case cont_fun.(buffer, true, state) do + {:halted, cont_fun, state} -> + {:cont, {cont_fun, state}} + + other -> + {:halt, other} + end + end + + @doc """ + Parses XML stream and returns a stream of elements. + + This function takes a stream and returns a stream of xml SAX events. + When any parsing error occurs, it raises a `Saxy.ParseError` exception. + + + ## Examples + + iex> stream = File.stream!("./test/support/fixture/foo.xml") + iex> Enum.to_list Saxy.stream_events stream + [ + start_document: [version: "1.0"], + start_element: {"foo", [{"bar", "value"}]}, + end_element: "foo" + ] + iex> Enum.to_list Saxy.stream_events ["unclosed value"] + ** (Saxy.ParseError) unexpected end of input, expected token: :chardata + + > #### Warning {: .warning } + > + > Input stream is evaluated lazily, therefore some events may be emitted before + > exception is raised + + ## Memory usage + + `Saxy.stream_events/2` takes a `File.Stream` or `Stream` as the input, so the amount of bytes to buffer in each + chunk can be controlled by `File.stream!/3` API. + + During parsing, the actual memory used by Saxy might be higher than the number configured for each chunk, since + Saxy holds in memory some parsed parts of the original binary to leverage Erlang sub-binary extracting. Anyway, + Saxy tries to free those up when it makes sense. + + ### Options + + See the “Shared options” section at the module documentation. + + * `:character_data_max_length` - tells the parser to emit the `:characters` event when its length exceeds the specified + number. The option is useful when the tag being parsed containing a very large chunk of data. Defaults to `:infinity`. + """ + @spec stream_events(in_stream :: Enumerable.t(), options :: Keyword.t()) :: out_stream :: Enumerable.t() + def stream_events(stream, options \\ []) do + expand_entity = Keyword.get(options, :expand_entity, :keep) + character_data_max_length = Keyword.get(options, :character_data_max_length, :infinity) + cdata_as_characters = Keyword.get(options, :cdata_as_characters, true) + + state = %State{ + prolog: nil, + handler: Accumulating, + user_state: [], + expand_entity: expand_entity, + cdata_as_characters: cdata_as_characters, + character_data_max_length: character_data_max_length + } + + init = {&Parser.Stream.parse_prolog(&1, &2, &1, 0, &3), state} + + stream + |> Stream.concat([:end_of_stream]) + |> Stream.transform(init, &transform_stream/2) + end + + defp transform_stream(:end_of_stream, {cont_fun, state}) do + do_transform_stream(<<>>, false, cont_fun, state) + end + + defp transform_stream(buffer, {cont_fun, state}) do + do_transform_stream(buffer, true, cont_fun, state) + end + + defp do_transform_stream(buffer, more?, cont_fun, state) do + case cont_fun.(buffer, more?, state) do + {:halted, cont_fun, %{user_state: user_state} = state} -> + {:lists.reverse(user_state), {cont_fun, %{state | user_state: []}}} + + {:error, error} -> + raise error + + other -> + {:halt, other} end end diff --git a/lib/saxy/handler/accumulating.ex b/lib/saxy/handler/accumulating.ex new file mode 100644 index 0000000..cba6978 --- /dev/null +++ b/lib/saxy/handler/accumulating.ex @@ -0,0 +1,11 @@ +defmodule Saxy.Handler.Accumulating do + # Accumulating handler originally intended to be + # used with stream transformations + @moduledoc false + + @behaviour Saxy.Handler + + def handle_event(event, data, state) do + {:ok, [{event, data} | state]} + end +end diff --git a/mix.lock b/mix.lock index 8e457c4..050791e 100644 --- a/mix.lock +++ b/mix.lock @@ -1,10 +1,10 @@ %{ "earmark": {:hex, :earmark, "1.2.5", "4d21980d5d2862a2e13ec3c49ad9ad783ffc7ca5769cf6ff891a4553fbaae761", [:mix], [], "hexpm", "c57508ddad47dfb8038ca6de1e616e66e9b87313220ac5d9817bc4a4dc2257b9"}, - "earmark_parser": {:hex, :earmark_parser, "1.4.13", "0c98163e7d04a15feb62000e1a891489feb29f3d10cb57d4f845c405852bbef8", [:mix], [], "hexpm", "d602c26af3a0af43d2f2645613f65841657ad6efc9f0e361c3b6c06b578214ba"}, - "ex_doc": {:hex, :ex_doc, "0.24.2", "e4c26603830c1a2286dae45f4412a4d1980e1e89dc779fcd0181ed1d5a05c8d9", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "e134e1d9e821b8d9e4244687fb2ace58d479b67b282de5158333b0d57c6fb7da"}, - "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, - "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, + "earmark_parser": {:hex, :earmark_parser, "1.4.31", "a93921cdc6b9b869f519213d5bc79d9e218ba768d7270d46fdcf1c01bacff9e2", [:mix], [], "hexpm", "317d367ee0335ef037a87e46c91a2269fef6306413f731e8ec11fc45a7efd059"}, + "ex_doc": {:hex, :ex_doc, "0.29.4", "6257ecbb20c7396b1fe5accd55b7b0d23f44b6aa18017b415cb4c2b91d997729", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "2c6699a737ae46cb61e4ed012af931b57b699643b24dabe2400a8168414bc4f5"}, + "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, + "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"}, "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, - "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.3.0", "9e18a119d9efc3370a3ef2a937bf0b24c088d9c4bf0ba9d7c3751d49d347d035", [:mix], [], "hexpm", "7977f183127a7cbe9346981e2f480dc04c55ffddaef746bd58debd566070eef8"}, "stream_data": {:hex, :stream_data, "0.5.0", "b27641e58941685c75b353577dc602c9d2c12292dd84babf506c2033cd97893e", [:mix], [], "hexpm", "012bd2eec069ada4db3411f9115ccafa38540a3c78c4c0349f151fc761b9e271"}, } diff --git a/test/saxy_test.exs b/test/saxy_test.exs index c5da6c5..e24d1d6 100644 --- a/test/saxy_test.exs +++ b/test/saxy_test.exs @@ -27,6 +27,18 @@ defmodule SaxyTest do end end + test "maps file streams" do + for fixture <- @fixtures do + stream = stream_fixture(fixture) + element_stream = Saxy.stream_events(stream) + assert [_ | _] = Enum.to_list element_stream + end + + assert_raise Saxy.ParseError, fn -> + Enum.to_list Saxy.stream_events stream_fixture "incorrect.xml" + end + end + test "parse_string/4 parses XML binary with multiple \":expand_entity\" strategy" do data = "Something &unknown;" diff --git a/test/support/fixture/incorrect.xml b/test/support/fixture/incorrect.xml new file mode 100644 index 0000000..9b7b48d --- /dev/null +++ b/test/support/fixture/incorrect.xml @@ -0,0 +1,8 @@ + + + + Belgian Waffles + $5.95 + Two of our famous Belgian Waffles with plenty of real maple syrup + 650 +