diff --git a/lib/mix/tasks/search.add.ex b/lib/mix/tasks/search.add.ex index 45b3326..be46e38 100644 --- a/lib/mix/tasks/search.add.ex +++ b/lib/mix/tasks/search.add.ex @@ -3,7 +3,7 @@ defmodule Mix.Tasks.Search.Add do alias Search.HexClient @moduledoc """ - Usage: mix #{Mix.Task.task_name(__MODULE__)} <package> [<version>] + Usage: mix #{Mix.Task.task_name(__MODULE__)} <package> [--version <version>] [--max-size <max_size>] Fetches the documentation for the given package from Hex. Does not embed it yet. @@ -17,21 +17,42 @@ defmodule Mix.Tasks.Search.Add do @impl Mix.Task def run(args) do - [package | args_tail] = args - - package_or_release = - case args_tail do - [version] -> - version = Version.parse!(version) - %HexClient.Release{package_name: package, version: version} + case OptionParser.parse(args, strict: [version: :string, max_size: :integer]) do + {opts, [package_name], []} -> + version = Keyword.get(opts, :version) + fragmentation_opts = Keyword.take(opts, [:max_size]) + + with {:ok, package_or_release} <- package_or_release(package_name, version), + {:ok, package} <- + Packages.add_package(package_or_release, fragmentation_opts: fragmentation_opts) do + Mix.shell().info("Package #{package.name}@#{package.version} added.") + else + {:error, err} -> + Mix.shell().error("Error: #{err}") + end + + {_opts, [], []} -> + Mix.shell().error("Expected a package name as one of the arguments.") + + {_opts, _more_than_one, []} -> + Mix.shell().error("Too many arguments.") + + {_opts, _, invalid} -> + invalid = + invalid + |> Enum.map(&elem(&1, 0)) + |> Enum.join(", ") + + Mix.shell().error("Incorrect or unknown options: #{invalid}") + end + end - [] -> - package - end + defp package_or_release(package_name, nil), do: {:ok, package_name} - case Packages.add_package(package_or_release) do - {:ok, package} -> Mix.shell().info("Package #{package.name}@#{package.version} added.") - {:error, err} -> Mix.shell().error("Error: #{err}") + defp package_or_release(package_name, version) do + case Version.parse(version) do + 
{:ok, version} -> {:ok, %HexClient.Release{package_name: package_name, version: version}} + :error -> {:error, "Could not parse the requested version."} end end end diff --git a/lib/search/fragmentation_scheme.ex b/lib/search/fragmentation_scheme.ex new file mode 100644 index 0000000..d8ff375 --- /dev/null +++ b/lib/search/fragmentation_scheme.ex @@ -0,0 +1,62 @@ +defmodule Search.FragmentationScheme do + @doc """ + Splits a binary into multiple binaries that satisfy limitations specified by opts. + + If possible, splits the text on whitespace to preserve words. If that is impossible, splits text in between graphemes. + + Supported options: + + * `:max_size` - maximum byte_size of the output binaries. The output binaries may have size less or equal to that + value, which also should guarantee the sequence length after tokenization will be bounded by this value. + """ + def split(text, opts \\ []) + def split("", _opts), do: [] + + def split(text, opts) when is_binary(text) do + case Keyword.get(opts, :max_size) do + nil -> + [text] + + max_size -> + text + |> compute_splits(max_size, 0, nil, []) + |> split_binary(text) + end + end + + @doc """ + Recreates the original text from a list of chunks. 
+ """ + def recombine(chunks), do: Enum.join(chunks) + + defp split_binary([], ""), do: [] + + defp split_binary([split_size | splits_tail], string) do + <<chunk::binary-size(split_size), rest::binary>> = string + [chunk | split_binary(splits_tail, rest)] + end + + defp compute_splits("", _, size, _, sizes), do: Enum.reverse(sizes, [size]) + + defp compute_splits(string, max_size, size, size_until_word, sizes) do + {grapheme, string} = String.next_grapheme(string) + grapheme_size = byte_size(grapheme) + + if size + grapheme_size > max_size do + if size_until_word do + # Split before the current unfinished word + next = size - size_until_word + compute_splits(string, max_size, next + grapheme_size, nil, [size_until_word | sizes]) + else + # The current chunk has a single word, just split it + compute_splits(string, max_size, grapheme_size, nil, [size | sizes]) + end + else + new_size = size + grapheme_size + size_until_word = if whitespace?(grapheme), do: new_size, else: size_until_word + compute_splits(string, max_size, new_size, size_until_word, sizes) + end + end + + defp whitespace?(grapheme), do: grapheme =~ ~r/\s/ +end diff --git a/lib/search/hex_client.ex b/lib/search/hex_client.ex index 167e7e8..6d793eb 100644 --- a/lib/search/hex_client.ex +++ b/lib/search/hex_client.ex @@ -5,7 +5,7 @@ defmodule Search.HexClient do def get_releases(package_name) when is_binary(package_name) do case get("packages/#{package_name}") do - {:ok, %{status: 200, body: releases}} -> + {:ok, %{status: 200, body: %{releases: releases}}} -> res = for %{version: version} <- releases do %HexClient.Release{ diff --git a/lib/search/packages.ex b/lib/search/packages.ex index 82b60b7..a459ece 100644 --- a/lib/search/packages.ex +++ b/lib/search/packages.ex @@ -1,5 +1,6 @@ defmodule Search.Packages do import Ecto.Query, warn: false + alias Search.FragmentationScheme alias Search.Repo alias Search.Packages.{Package, DocItem, DocFragment} @@ -11,20 +12,27 @@ defmodule Search.Packages do If given a package name, adds the latest version of 
the package to the app. If given a `%HexClient.Release{}` adds the specified release. Does not embed it yet. """ - def add_package(package_name) when is_binary(package_name) do + def add_package(name_or_release, opts \\ []) + + def add_package(package_name, opts) when is_binary(package_name) do case HexClient.get_releases(package_name) do {:ok, releases} -> latest = HexClient.Release.latest(releases) - add_package(latest) + add_package(latest, opts) err -> err end end - def add_package(%HexClient.Release{package_name: package_name, version: version} = release) do + def add_package( + %HexClient.Release{package_name: package_name, version: version} = release, + opts + ) do version = Version.to_string(version) + fragmentation_opts = Keyword.get(opts, :fragmentation_opts, []) + with {:ok, docs} <- HexClient.get_docs_tarball(release), {:ok, search_data} <- ExDocParser.extract_search_data(docs) do Repo.transaction_with(fn -> @@ -43,25 +51,46 @@ defmodule Search.Packages do |> Ecto.Changeset.put_assoc(:doc_items, []) with {:ok, package} <- Repo.insert_or_update(package), - :ok <- create_items_from_package(package, search_data) do + :ok <- create_items_from_package(package, search_data, fragmentation_opts) do {:ok, package} end end) end end - defp create_items_from_package(%Package{} = _package, []), do: :ok + defp create_items_from_package(%Package{} = _package, [], _fragmentation_opts), do: :ok - defp create_items_from_package(%Package{} = package, [search_data_head | search_data_tail]) do + defp create_items_from_package( + %Package{} = package, + [search_data_head | search_data_tail], + fragmentation_opts + ) do %{"doc" => doc, "title" => title, "ref" => ref, "type" => type} = search_data_head with {:ok, item} <- - create_doc_item(package, %{doc: doc, title: title, ref: ref, type: type}), - {:ok, _fragment} <- - create_doc_fragment(item, %{ - text: "# #{title}\n\n#{doc}" - }) do - create_items_from_package(package, search_data_tail) + create_doc_item(package, %{title: 
title, ref: ref, type: type}), + fragments = + doc + |> FragmentationScheme.split(fragmentation_opts) + |> Enum.with_index(), + {:ok, _fragments} <- + create_doc_fragments_from_binaries(item, fragments, []) do + create_items_from_package(package, search_data_tail, fragmentation_opts) + end + end + + defp create_doc_fragments_from_binaries(_doc_item, [], acc), do: {:ok, acc} + + defp create_doc_fragments_from_binaries(doc_item, [{text, order} | texts_tail], acc) do + case create_doc_fragment(doc_item, %{ + text: text, + order: order + }) do + {:ok, fragment} -> + create_doc_fragments_from_binaries(doc_item, texts_tail, [fragment | acc]) + + {:error, _} = err -> + err end end diff --git a/lib/search/packages/doc_fragment.ex b/lib/search/packages/doc_fragment.ex index 77a116a..1ff9d8d 100644 --- a/lib/search/packages/doc_fragment.ex +++ b/lib/search/packages/doc_fragment.ex @@ -5,6 +5,8 @@ defmodule Search.Packages.DocFragment do schema "doc_fragments" do field :text, :string + field :order, :integer + belongs_to :doc_item, Packages.DocItem timestamps(type: :utc_datetime) @@ -13,8 +15,8 @@ defmodule Search.Packages.DocFragment do @doc false def changeset(doc_fragment, attrs) do doc_fragment - |> cast(attrs, [:text]) + |> cast(attrs, [:text, :order]) |> cast_assoc(:doc_item) - |> validate_required([:text]) + |> validate_required([:text, :order]) end end diff --git a/lib/search/packages/doc_item.ex b/lib/search/packages/doc_item.ex index 5e97cf7..9cffc7d 100644 --- a/lib/search/packages/doc_item.ex +++ b/lib/search/packages/doc_item.ex @@ -7,7 +7,6 @@ defmodule Search.Packages.DocItem do field :type, :string field :title, :string field :ref, :string - field :doc, :string belongs_to :package, Packages.Package has_many :doc_fragments, Packages.DocFragment, on_replace: :delete @@ -17,7 +16,7 @@ defmodule Search.Packages.DocItem do @doc false def changeset(doc_item, attrs) do doc_item - |> cast(attrs, [:ref, :type, :title, :doc]) + |> cast(attrs, [:ref, :type, :title]) |> 
cast_assoc(:package) |> cast_assoc(:doc_fragments) |> validate_required([:ref, :type, :title]) diff --git a/lib/search_web/controllers/page_controller.ex b/lib/search_web/controllers/page_controller.ex index 86303ab..a3e56a5 100644 --- a/lib/search_web/controllers/page_controller.ex +++ b/lib/search_web/controllers/page_controller.ex @@ -38,6 +38,16 @@ defmodule SearchWeb.PageController do Search.Embeddings.knn_query(embedding_model, query_tensor, k: k) |> Stream.map(& &1.doc_fragment.doc_item) |> Enum.uniq_by(& &1.id) + |> Search.Repo.preload(:doc_fragments) + |> Stream.map(fn item -> + doc_content = + item.doc_fragments + |> Enum.sort_by(& &1.order) + |> Enum.map(& &1.text) + |> Search.FragmentationScheme.recombine() + + {item, doc_content} + end) render(conn, :search, items: items) else diff --git a/lib/search_web/controllers/page_html/search.html.heex b/lib/search_web/controllers/page_html/search.html.heex index 7a79cc5..8d97fc3 100644 --- a/lib/search_web/controllers/page_html/search.html.heex +++ b/lib/search_web/controllers/page_html/search.html.heex @@ -1,6 +1,4 @@ -
+

<%= item.title %>

- <%= if item.doc do %> - <%= raw(Earmark.as_html!(item.doc)) %> - <% end %> + <%= raw(Earmark.as_html!(doc_content)) %>
diff --git a/priv/repo/migrations/20240411191321_create_schema.exs b/priv/repo/migrations/20240411191321_create_schema.exs index 10fcc2b..2aaaadd 100644 --- a/priv/repo/migrations/20240411191321_create_schema.exs +++ b/priv/repo/migrations/20240411191321_create_schema.exs @@ -15,7 +15,6 @@ defmodule Search.Repo.Migrations.CreateSchema do add :ref, :string, null: false add :type, :string, null: false add :title, :string, null: false - add :doc, :text add :package_id, references("packages", on_delete: :delete_all), null: false timestamps(type: :utc_datetime) @@ -23,6 +22,7 @@ defmodule Search.Repo.Migrations.CreateSchema do create table(:doc_fragments) do add :text, :text, null: false + add :order, :integer, null: false add :doc_item_id, references("doc_items", on_delete: :delete_all), null: false timestamps(type: :utc_datetime) diff --git a/test/search/fragmentation_scheme_test.exs b/test/search/fragmentation_scheme_test.exs new file mode 100644 index 0000000..cca3391 --- /dev/null +++ b/test/search/fragmentation_scheme_test.exs @@ -0,0 +1,92 @@ +defmodule Search.FragmentationSchemeTest do + alias Search.FragmentationScheme + use ExUnit.Case, async: true + + describe "split/2" do + test "when given an empty string, returns empty list" do + assert FragmentationScheme.split("") == [] + end + + test "when given a string which satisfies the size constraint, returns a singleton list with that string" do + str = "short string" + + assert FragmentationScheme.split(str, max_size: 100) == [str] + end + + test "when given a string that is too long and splitting along whitespace is possible, splits the string" do + str = "some words and some more words" + + assert FragmentationScheme.split(str, max_size: 15) == [ + "some words and ", + "some more words" + ] + end + + test "when splitting along whitespace, respects non-space whitespace characters" do + str = "word\nword\tword\u{2003}word" + + assert FragmentationScheme.split(str, max_size: 7) == [ + "word\n", + "word\t", + 
"word\u{2003}", + "word" + ] + end + + test "when splitting along whitespace, if there is too much trailing whitespace, splits in the middle of it" do + str = "word other word" + + assert FragmentationScheme.split(str, max_size: 15) == [ + "word ", + " other word" + ] + end + + test "when splitting along whitespace and the text starts with whitespace, the whitespace characters are prepended to the first fragment" do + str = " words and some more words" + + assert FragmentationScheme.split(str, max_size: 15) == [ + " words and ", + "some more words" + ] + end + + test "when cannot split along whitespace, splits along grapheme boundaries" do + str1 = "asdfghjkl" + + assert FragmentationScheme.split(str1, max_size: 5) == [ + "asdfg", + "hjkl" + ] + + # the "g" has a bunch of diacritics, which means the grapheme is 4 codepoints / 7 bytes long + str2 = "asdfg\u{0300}\u{0322}\u{0342}hjkl" + + assert FragmentationScheme.split(str2, max_size: 7) == [ + "asdf", + "g\u{0300}\u{0322}\u{0342}", + "hjkl" + ] + end + end + + describe "recombine/1" do + test "recreates the original text" do + str = """ + Lorem ipsum dolor sit amet, consectetur adipiscing elit. + + Phasellus convallis libero at lectus vestibulum, sit amet mattis leo tempor. + + Aenean pulvinar purus ac euismod accumsan. + + Cras finibus risus laoreet neque condimentum, nec hendrerit justo blandit. + + Sed vitae orci ut odio pellentesque cursus. 
+ """ + + split = FragmentationScheme.split(str, max_size: 100) + + assert FragmentationScheme.recombine(split) == str + end + end +end diff --git a/test/search/packages_test.exs b/test/search/packages_test.exs index b286e0f..fdd9119 100644 --- a/test/search/packages_test.exs +++ b/test/search/packages_test.exs @@ -12,7 +12,8 @@ defmodule Search.PackagesTest do [item] = doc_items_fixture(1) valid_attrs = %{ - text: "Some text" + text: "Some text", + order: 0 } assert {:ok, %DocFragment{} = fragment} = Packages.create_doc_fragment(item, valid_attrs) @@ -38,14 +39,12 @@ defmodule Search.PackagesTest do valid_attrs = %{ title: "Some title", type: "module", - doc: "Some doc", ref: "Some ref" } assert {:ok, %DocItem{} = item} = Packages.create_doc_item(package, valid_attrs) assert item.title == valid_attrs.title assert item.type == valid_attrs.type - assert item.doc == valid_attrs.doc assert item.ref == valid_attrs.ref item = Repo.preload(item, :package) assert item.package.id == package.id diff --git a/test/support/fixtures/packages_fixtures.ex b/test/support/fixtures/packages_fixtures.ex index c24bf84..1f8c6b1 100644 --- a/test/support/fixtures/packages_fixtures.ex +++ b/test/support/fixtures/packages_fixtures.ex @@ -24,9 +24,8 @@ defmodule Search.PackagesFixtures do for i <- 1..num_items do Search.Repo.insert!(%Search.Packages.DocItem{ - title: "Module doc title", + title: "Module doc title #{i}", ref: "Test ref", - doc: "Text #{i}", type: "module", package: package }) @@ -34,11 +33,13 @@ defmodule Search.PackagesFixtures do end def doc_fragments_fixture(num_fragments) do - items = doc_items_fixture(num_fragments) + items = + doc_items_fixture(num_fragments) for item <- items do Search.Repo.insert!(%Search.Packages.DocFragment{ - text: "Preprocessed text: #{item.doc}", + text: "Preprocessed text for #{item.title}", + order: 0, doc_item: item }) end