This repository has been archived by the owner on Sep 3, 2024. It is now read-only.

Add simple fragmentation scheme #7

Merged · 19 commits · Aug 12, 2024
Changes from 11 commits
49 changes: 36 additions & 13 deletions lib/mix/tasks/search.add.ex
@@ -3,7 +3,7 @@ defmodule Mix.Tasks.Search.Add do
   alias Search.HexClient
 
   @moduledoc """
-  Usage: mix #{Mix.Task.task_name(__MODULE__)} <PACKAGE> [<VERSION>]
+  Usage: mix #{Mix.Task.task_name(__MODULE__)} <PACKAGE> [version:<VERSION>] [max_size:<MAX_SIZE>]

karol-t-wilk marked this conversation as resolved.

   Fetches the documentation for the given package from Hex. Does not embed it yet.

@@ -17,21 +17,44 @@ defmodule Mix.Tasks.Search.Add do
 
   @impl Mix.Task
   def run(args) do
-    [package | args_tail] = args
+    [package_name | args_tail] = args
 
+    with {:ok, args} <- parse_args(args_tail, []),
+         version = Keyword.get(args, :version),
+         fragmentation_opts = Keyword.take(args, [:max_size]),
+         package_or_release = package_or_release(package_name, version),
+         {:ok, package} <-
+           Packages.add_package(package_or_release, fragmentation_opts: fragmentation_opts) do
+      Mix.shell().info("Package #{package.name}@#{package.version} added.")
+    else
+      {:error, err} -> Mix.shell().error("Error: #{err}")
+    end
+  end
+
+  defp package_or_release(package_name, nil), do: package_name
+
-    package_or_release =
-      case args_tail do
-        [version] ->
-          version = Version.parse!(version)
-          %HexClient.Release{package_name: package, version: version}
+  defp package_or_release(package_name, version) do
+    %HexClient.Release{package_name: package_name, version: version}
+  end
 
-        [] ->
-          package
-      end
+  defp parse_args([], acc), do: {:ok, acc}
 
-    case Packages.add_package(package_or_release) do
-      {:ok, package} -> Mix.shell().info("Package #{package.name}@#{package.version} added.")
-      {:error, err} -> Mix.shell().error("Error: #{err}")
+  defp parse_args([arg | args_tail], acc) do
+    case arg do
+      "version:" <> version ->
+        case Version.parse(version) do
+          {:ok, version} -> parse_args(args_tail, Keyword.put(acc, :version, version))
+          :error -> {:error, ~c(Could not parse the "version" arg value)}
+        end
+
+      "max_size:" <> max_size ->
+        case Integer.parse(max_size) do
+          {max_size, ""} -> parse_args(args_tail, Keyword.put(acc, :max_size, max_size))
+          _ -> {:error, ~c(Could not parse the "max_size" arg value)}
+        end
+
+      _ ->
+        {:error, ~c(Unknown argument: "#{arg}")}
     end
   end
 end
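
Editor's note, for illustration only (the package name and values are hypothetical, not from this PR): with these changes the task would be invoked as

    mix search.add ecto version:3.11.1 max_size:2000

parse_args/2 folds the trailing arguments into a keyword list, so a call like the one above would fetch the 3.11.1 release of the package and split each document into fragments of at most 2000 bytes.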
73 changes: 73 additions & 0 deletions lib/search/fragmentation_scheme.ex
@@ -0,0 +1,73 @@
defmodule Search.FragmentationScheme do
  @doc """
  Splits a binary into multiple binaries that satisfy the limitations specified in opts.

  If possible, splits the text on whitespace to preserve words. If that is impossible, splits between graphemes.

  Supported options:

    * `:max_size` - maximum byte size of each output binary. Every output binary is at most this
      size, which should also guarantee that the sequence length after tokenization is bounded by this value.
  """
  def split(text, opts \\ [])
  def split("", _opts), do: []

  def split(text, opts) when is_binary(text) do
    case Keyword.get(opts, :max_size) do
      nil -> [text]
      max_size -> do_split(text, [], max_size)
    end
  end

  def recombine(chunks), do: Enum.join(chunks)
karol-t-wilk marked this conversation as resolved.

defp do_split("", acc, _max_size), do: Enum.reverse(acc)

defp do_split(text, acc, max_size) do
# capture the next word along with trailing whitespace and leading whitespace, if any
[next_word] =
Regex.run(~r/^([\s]*[^\s]+\s+)[^\s]*/, text, capture: :all_but_first) ||
[text]

word_chunks =
if byte_size(next_word) > max_size do
split_word("", next_word, [], max_size)
else
[next_word]
end

next_text = binary_slice(text, byte_size(next_word)..-1//1)

case {word_chunks, acc} do
{_, []} ->
do_split(next_text, word_chunks, max_size)

{[single_word], [acc_head | acc_tail]} ->
# we can try extending the last word in accumulator
if byte_size(acc_head) + byte_size(single_word) <= max_size do
do_split(next_text, [acc_head <> single_word | acc_tail], max_size)
Contributor:

Here we are incrementally building the binaries, but ultimately what we want to do is split a binary into adjacent chunks, so it would be more memory-efficient to accumulate chunks as {offset, size} and then produce the corresponding slices at the end.

Also, this could be a recursion using String.next_grapheme/1 to iterate over all graphemes (avoiding Regex.run). One way to do it would be to track the current string, offset, chunk start position and previous word end position. This is a less important optimisation, but it may actually end up simpler, so I am just dropping the idea :)

Collaborator Author:

Did as advised, and the function is indeed simpler; I even discovered another edge case to cover while redoing it. The function still seems rather ugly to me, though, so if you have any further suggestions I'd be more than happy to hear them.
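
Editor's sketch, not code from this PR: one way to realize the reviewer's suggestion, tracking chunks as {offset, size} pairs and slicing the original binary only once at the end. Whitespace handling is omitted for brevity, and the module and function names are hypothetical.

    defmodule ChunkByOffset do
      def split(text, max_size) when is_binary(text) and max_size > 0 do
        text
        |> chunks(0, 0, [], max_size)
        |> Enum.reverse()
        |> Enum.map(fn {offset, size} -> binary_part(text, offset, size) end)
      end

      # start: byte offset where the current chunk begins;
      # len: bytes already accepted into the current chunk.
      defp chunks(text, start, len, acc, max_size) do
        case String.next_grapheme(binary_slice(text, (start + len)..-1//1)) do
          nil ->
            # end of input: flush the final chunk, if any
            if len > 0, do: [{start, len} | acc], else: acc

          {grapheme, _rest} ->
            size = byte_size(grapheme)

            cond do
              # the grapheme fits: extend the current chunk
              len + size <= max_size ->
                chunks(text, start, len + size, acc, max_size)

              # an oversized single grapheme becomes its own chunk
              len == 0 ->
                chunks(text, start + size, 0, [{start, size} | acc], max_size)

              # the grapheme would overflow: close the chunk, start a new one
              true ->
                chunks(text, start + len, 0, [{start, len} | acc], max_size)
            end
        end
      end
    end

Since binary_part/3 returns sub-binaries that share the original binary's memory, no intermediate binaries are copied while the chunk list is being accumulated.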

        else
          do_split(next_text, [single_word | acc], max_size)
        end

      _ ->
        # the word had to be split into chunks; there is no need to extend the last word in the accumulator
        do_split(next_text, word_chunks ++ acc, max_size)
    end
  end

  defp split_word("", "", acc, _max_size), do: acc
  defp split_word(chunk, "", acc, _max_size), do: [chunk | acc]

  defp split_word(current_chunk, word_rest, acc, max_size) do
    {next_graph, word_rest} = String.next_grapheme(word_rest)

    if byte_size(current_chunk) + byte_size(next_graph) <= max_size do
      # we can continue building the current chunk of the word
      split_word(current_chunk <> next_graph, word_rest, acc, max_size)
    else
      # the next grapheme would bring the chunk over the max size, push to the accumulator
      split_word(next_graph, word_rest, [current_chunk | acc], max_size)
    end
  end
end
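
Editor's note (not part of the diff): with this implementation, trailing whitespace stays attached to the preceding word, which is what makes recombine/1 lossless. A hypothetical session:

    iex> Search.FragmentationScheme.split("aaa bbb ccc", max_size: 8)
    ["aaa bbb ", "ccc"]
    iex> Search.FragmentationScheme.recombine(["aaa bbb ", "ccc"])
    "aaa bbb ccc"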
2 changes: 1 addition & 1 deletion lib/search/hex_client.ex
@@ -5,7 +5,7 @@ defmodule Search.HexClient do
 
   def get_releases(package_name) when is_binary(package_name) do
     case get("packages/#{package_name}") do
-      {:ok, %{status: 200, body: releases}} ->
+      {:ok, %{status: 200, body: %{releases: releases}}} ->
         res =
           for %{version: version} <- releases do
             %HexClient.Release{
53 changes: 41 additions & 12 deletions lib/search/packages.ex
@@ -1,5 +1,6 @@
 defmodule Search.Packages do
   import Ecto.Query, warn: false
+  alias Search.FragmentationScheme
   alias Search.Repo
 
   alias Search.Packages.{Package, DocItem, DocFragment}
@@ -11,20 +12,27 @@ defmodule Search.Packages do
   If given a package name, adds the latest version of the package to the app. If given a `%HexClient.Release{}` adds
   the specified release. Does not embed it yet.
   """
-  def add_package(package_name) when is_binary(package_name) do
+  def add_package(name_or_release, opts \\ [])
+
+  def add_package(package_name, opts) when is_binary(package_name) do
     case HexClient.get_releases(package_name) do
       {:ok, releases} ->
         latest = HexClient.Release.latest(releases)
-        add_package(latest)
+        add_package(latest, opts)
 
       err ->
         err
     end
   end
 
-  def add_package(%HexClient.Release{package_name: package_name, version: version} = release) do
+  def add_package(
+        %HexClient.Release{package_name: package_name, version: version} = release,
+        opts
+      ) do
     version = Version.to_string(version)
 
+    fragmentation_opts = Keyword.get(opts, :fragmentation_opts, [])
+
     with {:ok, docs} <- HexClient.get_docs_tarball(release),
          {:ok, search_data} <- ExDocParser.extract_search_data(docs) do
       Repo.transaction_with(fn ->
@@ -43,25 +51,46 @@ defmodule Search.Packages do
         |> Ecto.Changeset.put_assoc(:doc_items, [])
 
       with {:ok, package} <- Repo.insert_or_update(package),
-           :ok <- create_items_from_package(package, search_data) do
+           :ok <- create_items_from_package(package, search_data, fragmentation_opts) do
         {:ok, package}
       end
     end)
   end
 end
 
-  defp create_items_from_package(%Package{} = _package, []), do: :ok
+  defp create_items_from_package(%Package{} = _package, [], _fragmentation_opts), do: :ok
 
-  defp create_items_from_package(%Package{} = package, [search_data_head | search_data_tail]) do
+  defp create_items_from_package(
+         %Package{} = package,
+         [search_data_head | search_data_tail],
+         fragmentation_opts
+       ) do
     %{"doc" => doc, "title" => title, "ref" => ref, "type" => type} = search_data_head
 
     with {:ok, item} <-
-           create_doc_item(package, %{doc: doc, title: title, ref: ref, type: type}),
-         {:ok, _fragment} <-
-           create_doc_fragment(item, %{
-             text: "# #{title}\n\n#{doc}"
-           }) do
-      create_items_from_package(package, search_data_tail)
+           create_doc_item(package, %{title: title, ref: ref, type: type}),
+         fragments =
+           doc
+           |> FragmentationScheme.split(fragmentation_opts)
+           |> Enum.with_index(),
+         {:ok, _fragments} <-
+           create_doc_fragments_from_binaries(item, fragments, []) do
+      create_items_from_package(package, search_data_tail, fragmentation_opts)
     end
   end
+
+  defp create_doc_fragments_from_binaries(_doc_item, [], acc), do: {:ok, acc}
+
+  defp create_doc_fragments_from_binaries(doc_item, [{text, order} | texts_tail], acc) do
+    case create_doc_fragment(doc_item, %{
+           text: text,
+           order: order
+         }) do
+      {:ok, fragment} ->
+        create_doc_fragments_from_binaries(doc_item, texts_tail, [fragment | acc])
+
+      {:error, _} = err ->
+        err
+    end
+  end
 
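
Editor's note: fragmentation is now opt-in per call. A hypothetical invocation (package name and size are made up):

    Search.Packages.add_package("ecto", fragmentation_opts: [max_size: 2000])

Each search-data entry then becomes one DocItem plus its ordered DocFragment rows of at most 2000 bytes each.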
6 changes: 4 additions & 2 deletions lib/search/packages/doc_fragment.ex
@@ -5,6 +5,8 @@ defmodule Search.Packages.DocFragment do
 
   schema "doc_fragments" do
     field :text, :string
+    field :order, :integer
+
     belongs_to :doc_item, Packages.DocItem
 
     timestamps(type: :utc_datetime)
@@ -13,8 +15,8 @@ defmodule Search.Packages.DocFragment do
   @doc false
   def changeset(doc_fragment, attrs) do
     doc_fragment
-    |> cast(attrs, [:text])
+    |> cast(attrs, [:text, :order])
     |> cast_assoc(:doc_item)
-    |> validate_required([:text])
+    |> validate_required([:text, :order])
   end
 end
3 changes: 1 addition & 2 deletions lib/search/packages/doc_item.ex
@@ -7,7 +7,6 @@ defmodule Search.Packages.DocItem do
     field :type, :string
     field :title, :string
     field :ref, :string
-    field :doc, :string
     belongs_to :package, Packages.Package
     has_many :doc_fragments, Packages.DocFragment, on_replace: :delete
 
@@ -17,7 +16,7 @@ defmodule Search.Packages.DocItem do
   @doc false
   def changeset(doc_item, attrs) do
     doc_item
-    |> cast(attrs, [:ref, :type, :title, :doc])
+    |> cast(attrs, [:ref, :type, :title])
     |> cast_assoc(:package)
     |> cast_assoc(:doc_fragments)
     |> validate_required([:ref, :type, :title])
10 changes: 10 additions & 0 deletions lib/search_web/controllers/page_controller.ex
@@ -38,6 +38,16 @@ defmodule SearchWeb.PageController do
       Search.Embeddings.knn_query(embedding_model, query_tensor, k: k)
       |> Stream.map(& &1.doc_fragment.doc_item)
       |> Enum.uniq_by(& &1.id)
+      |> Search.Repo.preload(:doc_fragments)
+      |> Stream.map(fn item ->
+        doc_content =
+          item.doc_fragments
+          |> Enum.sort_by(& &1.order)
+          |> Enum.map(& &1.text)
+          |> Search.FragmentationScheme.recombine()
+
+        {item, doc_content}
+      end)
 
     render(conn, :search, items: items)
   else
6 changes: 2 additions & 4 deletions lib/search_web/controllers/page_html/search.html.heex
@@ -1,6 +1,4 @@
-<div :for={item <- @items} class="bg-gray-100 p-4 m-4 rounded">
+<div :for={{item, doc_content} <- @items} class="bg-gray-100 p-4 m-4 rounded">
   <p class="text-lg font-bold"><%= item.title %></p>
-  <%= if item.doc do %>
-    <%= raw(Earmark.as_html!(item.doc)) %>
-  <% end %>
+  <%= raw(Earmark.as_html!(doc_content)) %>
 </div>
2 changes: 1 addition & 1 deletion priv/repo/migrations/20240411191321_create_schema.exs
@@ -15,14 +15,14 @@ defmodule Search.Repo.Migrations.CreateSchema do
       add :ref, :string, null: false
       add :type, :string, null: false
      add :title, :string, null: false
-      add :doc, :text
       add :package_id, references("packages", on_delete: :delete_all), null: false
 
       timestamps(type: :utc_datetime)
     end
 
     create table(:doc_fragments) do
       add :text, :text, null: false
+      add :order, :integer, null: false
       add :doc_item_id, references("doc_items", on_delete: :delete_all), null: false
 
       timestamps(type: :utc_datetime)