Skip to content

Commit

Permalink
Use byte length instead of grapheme count for embedding text
Browse files Browse the repository at this point in the history
  • Loading branch information
mbklein committed Aug 16, 2024
1 parent 0e59a33 commit 40a20f0
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion app/lib/meadow/indexing/v2/work.ex
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ defmodule Meadow.Indexing.V2.Work do
|> Enum.join("\n")

Map.put(map, :embedding_text_length, String.length(value))
|> Map.put(:embedding_text, String.slice(value, 0, 2047))
|> Map.put(:embedding_text, truncate(value, 2048))
end

# Collapses a list of values into a single string, separating entries with "; ".
defp concatenate(values) when is_list(values) do
  Enum.join(values, "; ")
end
Expand Down Expand Up @@ -230,4 +230,21 @@ defmodule Meadow.Indexing.V2.Work do
file_set -> FileSets.aspect_ratio(file_set)
end
end

@doc """
Truncates `string` so that it occupies at most `byte_limit` bytes.

The cut always falls on a grapheme boundary: a grapheme that would push the
result past `byte_limit` is dropped in its entirety, together with everything
after it. The result may therefore be shorter than `byte_limit` bytes.

Returns `string` unchanged when it already fits, and `""` when not even the
first grapheme fits (for example when `byte_limit` is zero or negative).
"""
@spec truncate(String.t(), integer()) :: String.t()
def truncate(string, byte_limit) when byte_size(string) <= byte_limit, do: string

def truncate(string, byte_limit) do
  # Only the leading graphemes are inspected; the remainder of a long string
  # is never segmented, and the prefix is sliced out in a single operation.
  kept = grapheme_prefix_size(string, byte_limit, 0)
  binary_part(string, 0, kept)
end

# Returns the byte size of the longest prefix of `rest` (counting the `kept`
# bytes already accepted) that is made of whole graphemes and does not exceed
# `byte_limit`.
defp grapheme_prefix_size(rest, byte_limit, kept) do
  case String.next_grapheme(rest) do
    {grapheme, tail} when kept + byte_size(grapheme) <= byte_limit ->
      grapheme_prefix_size(tail, byte_limit, kept + byte_size(grapheme))

    # Either the input is exhausted (`nil`) or the next grapheme does not fit.
    _ ->
      kept
  end
end
end

0 comments on commit 40a20f0

Please sign in to comment.