Skip to content

Commit

Permalink
Saxy.stream_events implemented (#118)
Browse files Browse the repository at this point in the history
  • Loading branch information
hissssst authored May 7, 2023
1 parent 2d35859 commit dce77bf
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 9 deletions.
96 changes: 92 additions & 4 deletions lib/saxy.ex
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,11 @@ defmodule Saxy do
"""

@compile {:inline, do_transform_stream: 4}

alias Saxy.{
Encoder,
Handler.Accumulating,
Parser,
State
}
Expand Down Expand Up @@ -309,10 +312,95 @@ defmodule Saxy do
end

# Feeds one chunk (`buffer`) to the parser continuation `cont_fun` together with the current state.
# A `{:halted, cont_fun, state}` result yields `{:cont, {cont_fun, state}}`, carrying the new
# continuation and state forward; any other result is wrapped as `{:halt, other}`.
# The literal `true` is presumably the "more input may follow" flag — TODO confirm against the parser.
# NOTE(review): this span is a rendered diff hunk without +/- markers; the `with`/`else` lines
# look like the removed version and the `case` lines like the added one — confirm against the repository.
defp reduce_stream(buffer, {cont_fun, state}) do
with {:halted, cont_fun, state} <- cont_fun.(buffer, true, state) do
{:cont, {cont_fun, state}}
else
other -> {:halt, other}
case cont_fun.(buffer, true, state) do
{:halted, cont_fun, state} ->
{:cont, {cont_fun, state}}

other ->
{:halt, other}
end
end

@doc """
Lazily parses an XML stream into a stream of SAX events.

This function takes a stream of XML chunks and returns a stream of
`{event_type, event_data}` tuples. When any parsing error occurs, it raises
a `Saxy.ParseError` exception.

## Examples

    iex> stream = File.stream!("./test/support/fixture/foo.xml")
    iex> Enum.to_list Saxy.stream_events stream
    [
      start_document: [version: "1.0"],
      start_element: {"foo", [{"bar", "value"}]},
      end_element: "foo"
    ]

    iex> Enum.to_list Saxy.stream_events ["<foo>unclosed value"]
    ** (Saxy.ParseError) unexpected end of input, expected token: :chardata

> #### Warning {: .warning }
>
> Input stream is evaluated lazily, therefore some events may be emitted before
> the exception is raised.

## Memory usage

`Saxy.stream_events/2` takes a `File.Stream` or `Stream` as the input, so the
number of bytes buffered in each chunk can be controlled through the
`File.stream!/3` API.

During parsing, the actual memory used by Saxy might be higher than the number
configured for each chunk, since Saxy holds in memory some parsed parts of the
original binary to leverage Erlang sub-binary extracting. Anyway, Saxy tries to
free those up when it makes sense.

## Options

See the “Shared options” section at the module documentation.

* `:character_data_max_length` - tells the parser to emit the `:characters`
  event when its length exceeds the specified number. The option is useful
  when the tag being parsed contains a very large chunk of data. Defaults to
  `:infinity`.
"""
@spec stream_events(in_stream :: Enumerable.t(), options :: Keyword.t()) :: out_stream :: Enumerable.t()
def stream_events(stream, options \\ []) do
  # Events are collected by the accumulating handler in `user_state`;
  # the transform step drains that list each time the parser pauses.
  initial_state = %State{
    prolog: nil,
    handler: Accumulating,
    user_state: [],
    expand_entity: Keyword.get(options, :expand_entity, :keep),
    cdata_as_characters: Keyword.get(options, :cdata_as_characters, true),
    character_data_max_length: Keyword.get(options, :character_data_max_length, :infinity)
  }

  # First continuation: start at the prolog, with the chunk passed both as the
  # buffer and as the original binary, at position 0.
  parse_prolog = fn buffer, more?, state ->
    Parser.Stream.parse_prolog(buffer, more?, buffer, 0, state)
  end

  # The sentinel appended after the last chunk lets the transform step tell the
  # parser that no more input will arrive.
  terminated_stream = Stream.concat(stream, [:end_of_stream])

  Stream.transform(terminated_stream, {parse_prolog, initial_state}, &transform_stream/2)
end

# Stream.transform/3 step: hands the next element of the input stream to the parser.
# The `:end_of_stream` sentinel is translated into an empty buffer with `more?` set to
# false; every other element is passed through as a chunk with `more?` set to true.
defp transform_stream(chunk, {cont_fun, state}) do
  case chunk do
    :end_of_stream -> do_transform_stream(<<>>, false, cont_fun, state)
    _ -> do_transform_stream(chunk, true, cont_fun, state)
  end
end

# Runs the parser continuation on one buffer and translates its result into the
# return shape expected by Stream.transform/3:
#
#   * `{:error, exception}` - the exception is raised
#   * `{:halted, next_fun, state}` - the events gathered in `user_state` are emitted
#     (reversed, since the accumulator is built by prepending) and the accumulator is
#     reset before continuing with `next_fun`
#   * anything else - the stream is halted with that value as the final accumulator
defp do_transform_stream(buffer, more?, cont_fun, state) do
  parser_result = cont_fun.(buffer, more?, state)

  case parser_result do
    {:error, exception} ->
      raise exception

    {:halted, next_fun, %{user_state: pending} = paused_state} ->
      events = :lists.reverse(pending)
      {events, {next_fun, %{paused_state | user_state: []}}}

    finished ->
      {:halt, finished}
  end
end

Expand Down
11 changes: 11 additions & 0 deletions lib/saxy/handler/accumulating.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
defmodule Saxy.Handler.Accumulating do
  # Internal handler that collects every SAX event it receives. Originally
  # intended to be used with stream transformations.
  @moduledoc false

  @behaviour Saxy.Handler

  # Prepends `{event, data}` to the accumulator list, so events are stored
  # newest-first; a consumer has to reverse the list to read them in the
  # order they were received.
  @impl true
  def handle_event(event, data, state) do
    {:ok, [{event, data} | state]}
  end
end
10 changes: 5 additions & 5 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
%{
"earmark": {:hex, :earmark, "1.2.5", "4d21980d5d2862a2e13ec3c49ad9ad783ffc7ca5769cf6ff891a4553fbaae761", [:mix], [], "hexpm", "c57508ddad47dfb8038ca6de1e616e66e9b87313220ac5d9817bc4a4dc2257b9"},
"earmark_parser": {:hex, :earmark_parser, "1.4.13", "0c98163e7d04a15feb62000e1a891489feb29f3d10cb57d4f845c405852bbef8", [:mix], [], "hexpm", "d602c26af3a0af43d2f2645613f65841657ad6efc9f0e361c3b6c06b578214ba"},
"ex_doc": {:hex, :ex_doc, "0.24.2", "e4c26603830c1a2286dae45f4412a4d1980e1e89dc779fcd0181ed1d5a05c8d9", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "e134e1d9e821b8d9e4244687fb2ace58d479b67b282de5158333b0d57c6fb7da"},
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
"makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"},
"earmark_parser": {:hex, :earmark_parser, "1.4.31", "a93921cdc6b9b869f519213d5bc79d9e218ba768d7270d46fdcf1c01bacff9e2", [:mix], [], "hexpm", "317d367ee0335ef037a87e46c91a2269fef6306413f731e8ec11fc45a7efd059"},
"ex_doc": {:hex, :ex_doc, "0.29.4", "6257ecbb20c7396b1fe5accd55b7b0d23f44b6aa18017b415cb4c2b91d997729", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "2c6699a737ae46cb61e4ed012af931b57b699643b24dabe2400a8168414bc4f5"},
"makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"},
"makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"},
"makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"},
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
"nimble_parsec": {:hex, :nimble_parsec, "1.3.0", "9e18a119d9efc3370a3ef2a937bf0b24c088d9c4bf0ba9d7c3751d49d347d035", [:mix], [], "hexpm", "7977f183127a7cbe9346981e2f480dc04c55ffddaef746bd58debd566070eef8"},
"stream_data": {:hex, :stream_data, "0.5.0", "b27641e58941685c75b353577dc602c9d2c12292dd84babf506c2033cd97893e", [:mix], [], "hexpm", "012bd2eec069ada4db3411f9115ccafa38540a3c78c4c0349f151fc761b9e271"},
}
12 changes: 12 additions & 0 deletions test/saxy_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ defmodule SaxyTest do
end
end

test "maps file streams" do
  # Every well-formed fixture must yield at least one event.
  Enum.each(@fixtures, fn fixture ->
    events =
      fixture
      |> stream_fixture()
      |> Saxy.stream_events()
      |> Enum.to_list()

    assert [_ | _] = events
  end)

  # The malformed fixture must raise once the stream is consumed.
  assert_raise Saxy.ParseError, fn ->
    "incorrect.xml"
    |> stream_fixture()
    |> Saxy.stream_events()
    |> Enum.to_list()
  end
end

test "parse_string/4 parses XML binary with multiple \":expand_entity\" strategy" do
data = "<foo>Something &unknown;</foo>"

Expand Down
8 changes: 8 additions & 0 deletions test/support/fixture/incorrect.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
<food>
<name>Belgian Waffles</name>
<price>$5.95</price>
<description>Two of our famous Belgian Waffles with plenty of real maple syrup</description>
<calories>650</calories>
</food>

0 comments on commit dce77bf

Please sign in to comment.