From 40a20f06677d591a72c27feda5a458d95281ae4f Mon Sep 17 00:00:00 2001
From: "Michael B. Klein"
Date: Fri, 16 Aug 2024 19:41:44 +0000
Subject: [PATCH] Use byte length instead of grapheme count for embedding text

---
 app/lib/meadow/indexing/v2/work.ex | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/app/lib/meadow/indexing/v2/work.ex b/app/lib/meadow/indexing/v2/work.ex
index 6d6f53b07..218ac76c5 100644
--- a/app/lib/meadow/indexing/v2/work.ex
+++ b/app/lib/meadow/indexing/v2/work.ex
@@ -95,7 +95,7 @@ defmodule Meadow.Indexing.V2.Work do
       |> Enum.join("\n")
 
     Map.put(map, :embedding_text_length, String.length(value))
-    |> Map.put(:embedding_text, String.slice(value, 0, 2047))
+    |> Map.put(:embedding_text, truncate(value, 2048))
   end
 
   defp concatenate(v) when is_list(v), do: Enum.join(v, "; ")
@@ -230,4 +230,39 @@ defmodule Meadow.Indexing.V2.Work do
       file_set -> FileSets.aspect_ratio(file_set)
     end
   end
+
+  @doc """
+  Truncates `string` so that it occupies at most `byte_limit` bytes.
+
+  The cut always falls on a grapheme boundary, so a multi-byte character is
+  never split; the result may therefore be slightly shorter than `byte_limit`.
+
+  ## Examples
+
+      iex> Meadow.Indexing.V2.Work.truncate("héllo", 2)
+      "h"
+
+  """
+  @spec truncate(String.t(), non_neg_integer()) :: String.t()
+  def truncate(string, byte_limit) when is_binary(string) do
+    if byte_size(string) <= byte_limit do
+      # Already within the limit: skip grapheme segmentation entirely.
+      string
+    else
+      binary_part(string, 0, fitting_prefix_size(string, byte_limit, 0))
+    end
+  end
+
+  # Walks `rest` one grapheme at a time and returns how many leading bytes can
+  # be kept without exceeding `byte_limit`; stops at the first grapheme that
+  # does not fit, so the remainder of a long string is never segmented.
+  defp fitting_prefix_size(rest, byte_limit, kept) do
+    case String.next_grapheme(rest) do
+      {grapheme, tail} when kept + byte_size(grapheme) <= byte_limit ->
+        fitting_prefix_size(tail, byte_limit, kept + byte_size(grapheme))
+
+      _ ->
+        kept
+    end
+  end
 end