From 076ad2734ff0eb146bff347d9f58108731fabb14 Mon Sep 17 00:00:00 2001
From: Christian Sutter <git@doublesignal.com>
Date: Tue, 5 Mar 2024 16:07:52 +0000
Subject: [PATCH] Reintroduce excessive body content truncation

We previously removed this (#129) as the main issue we were seeing was
around metadata limits being exceeded, but we do have a very small
subset of documents that exceed even the 1MB body content limit.

This adds truncation back in (at 999KB) to handle that long tail.
---
 app/models/concerns/publishing_api/content.rb      |  6 ++++++
 .../models/concerns/publishing_api/content_spec.rb | 14 ++++++++++++++
 2 files changed, 20 insertions(+)
diff --git a/app/models/concerns/publishing_api/content.rb b/app/models/concerns/publishing_api/content.rb
index ffc2ec4..9bfb272 100644
--- a/app/models/concerns/publishing_api/content.rb
+++ b/app/models/concerns/publishing_api/content.rb
@@ -52,6 +52,11 @@ module Content
     ].map { JsonPath.new(_1, use_symbols: true) }.freeze
     INDEXABLE_CONTENT_SEPARATOR = "\n".freeze
 
+    # The limit of content length on Discovery Engine API is currently 1MB (not MiB). A small
+    # handful of documents exceed this, so we need to truncate the content to a reasonable size.
+    # This is slightly lower than 1 million bytes to allow for some rounding error.
+    INDEXABLE_CONTENT_MAX_BYTE_SIZE = 999_000
+
     # Extracts a single string of indexable unstructured content from the document.
     def content
       values_from_json_paths = INDEXABLE_CONTENT_VALUES_JSON_PATHS.map do |item|
@@ -67,6 +72,7 @@ def content
         .flatten
         .compact_blank
         .join(INDEXABLE_CONTENT_SEPARATOR)
+        .truncate_bytes(INDEXABLE_CONTENT_MAX_BYTE_SIZE)
     end
   end
 end
diff --git a/spec/models/concerns/publishing_api/content_spec.rb b/spec/models/concerns/publishing_api/content_spec.rb
index 694e650..08b2816 100644
--- a/spec/models/concerns/publishing_api/content_spec.rb
+++ b/spec/models/concerns/publishing_api/content_spec.rb
@@ -72,6 +72,20 @@
       it { is_expected.to eq("<h1>Foo</h1>\nbar\n<h1>Bar</h1>\n<blink>baz</blink>") }
     end
 
+    describe "with excessively large content" do
+      let(:document_hash) do
+        {
+          details: {
+            body: "a" * 1200.kilobytes, # over 1.2 million single-byte characters
+          },
+        }
+      end
+
+      it "truncates the content" do
+        expect(extracted_content.bytesize).to be < 1_000_000 # upper bound only, not an exact size
+      end
+    end
+
     describe "without any fields" do
       let(:document_hash) do
         {