From 076ad2734ff0eb146bff347d9f58108731fabb14 Mon Sep 17 00:00:00 2001 From: Christian Sutter Date: Tue, 5 Mar 2024 16:07:52 +0000 Subject: [PATCH] Reintroduce excessive body content truncation We previously removed this (#129) as the main issue we were seeing was around metadata limits being exceeded, but we do have a very small subset of documents that exceed even the 1MB body content limit. This adds truncation back in (at 999KB) to handle that long tail. --- app/models/concerns/publishing_api/content.rb | 6 ++++++ .../models/concerns/publishing_api/content_spec.rb | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/app/models/concerns/publishing_api/content.rb b/app/models/concerns/publishing_api/content.rb index ffc2ec4..9bfb272 100644 --- a/app/models/concerns/publishing_api/content.rb +++ b/app/models/concerns/publishing_api/content.rb @@ -52,6 +52,11 @@ module Content ].map { JsonPath.new(_1, use_symbols: true) }.freeze INDEXABLE_CONTENT_SEPARATOR = "\n".freeze + # The limit of content length on Discovery Engine API is currently 1MB (not MiB), a small + # handful of documents exceed this so we need to truncate the content to a reasonable size. + # This is slightly lower than 1 million bytes to allow for some rounding error. + INDEXABLE_CONTENT_MAX_BYTE_SIZE = 999_000 + # Extracts a single string of indexable unstructured content from the document. def content values_from_json_paths = INDEXABLE_CONTENT_VALUES_JSON_PATHS.map do |item| @@ -67,6 +72,7 @@ def content .flatten .compact_blank .join(INDEXABLE_CONTENT_SEPARATOR) + .truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE) end end end diff --git a/spec/models/concerns/publishing_api/content_spec.rb b/spec/models/concerns/publishing_api/content_spec.rb index 694e650..08b2816 100644 --- a/spec/models/concerns/publishing_api/content_spec.rb +++ b/spec/models/concerns/publishing_api/content_spec.rb @@ -72,6 +72,20 @@ it { is_expected.to eq("

Foo

\nbar\n

Bar

\nbaz") } end + describe "with excessively large content" do + let(:document_hash) do + { + details: { + body: "a" * 1200.kilobytes, + }, + } + end + + it "truncates the content" do + expect(extracted_content.bytesize).to be < 1_000_000 + end + end + describe "without any fields" do let(:document_hash) do {