diff --git a/app/models/concerns/publishing_api/content.rb b/app/models/concerns/publishing_api/content.rb index ffc2ec4..9bfb272 100644 --- a/app/models/concerns/publishing_api/content.rb +++ b/app/models/concerns/publishing_api/content.rb @@ -52,6 +52,11 @@ module Content ].map { JsonPath.new(_1, use_symbols: true) }.freeze INDEXABLE_CONTENT_SEPARATOR = "\n".freeze + # The limit of content length on Discovery Engine API is currently 1MB (not MiB). A small + # handful of documents exceed this, so we need to truncate the content to a reasonable size. + # This is slightly lower than 1 million bytes to allow for some rounding error. + INDEXABLE_CONTENT_MAX_BYTE_SIZE = 999_000 + # Extracts a single string of indexable unstructured content from the document. def content values_from_json_paths = INDEXABLE_CONTENT_VALUES_JSON_PATHS.map do |item| @@ -67,6 +72,7 @@ def content .flatten .compact_blank .join(INDEXABLE_CONTENT_SEPARATOR) + .truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE) end end end diff --git a/spec/models/concerns/publishing_api/content_spec.rb b/spec/models/concerns/publishing_api/content_spec.rb index 694e650..08b2816 100644 --- a/spec/models/concerns/publishing_api/content_spec.rb +++ b/spec/models/concerns/publishing_api/content_spec.rb @@ -72,6 +72,20 @@ it { is_expected.to eq("