Skip to content

Commit

Permalink
Merge pull request #104 from alphagov/content-size
Browse files Browse the repository at this point in the history
Truncate content above Discovery Engine limit
  • Loading branch information
csutter authored Nov 6, 2023
2 parents 2f3135f + 119c65c commit 9493b0f
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
13 changes: 9 additions & 4 deletions app/models/concerns/publishing_api/content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module Content
$.details.more_information
].map { JsonPath.new(_1, use_symbols: true) }.freeze
INDEXABLE_CONTENT_SEPARATOR = "\n".freeze
INDEXABLE_CONTENT_MAX_BYTE_SIZE = 950.kilobytes

# Extracts a single string of indexable unstructured content from the document.
def content
Expand All @@ -24,10 +25,14 @@ def content
["<h1>#{_1[:title]}</h1>", ContentWithMultipleTypes.new(_1[:body]).html_content]
end

[
*values_from_json_paths,
*values_from_parts,
].flatten.join(INDEXABLE_CONTENT_SEPARATOR)
[*values_from_json_paths, *values_from_parts]
.flatten
.join(INDEXABLE_CONTENT_SEPARATOR)
# Only take the first INDEXABLE_CONTENT_MAX_BYTE_SIZE bytes of the string
.byteslice(0, INDEXABLE_CONTENT_MAX_BYTE_SIZE)
# Remove any invalid UTF-8 byte sequences (such as a multi-byte character cut in half at the
# end of the string) that might have been introduced through slicing the string by bytes
.scrub("")
end
end
end
14 changes: 14 additions & 0 deletions spec/models/concerns/publishing_api/content_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,20 @@
it { is_expected.to eq("<h1>Foo</h1>\nbar\n<h1>Bar</h1>\n<blink>baz</blink>") }
end

describe "with excessively large content" do
  # A single body field built from the ASCII character "a" repeated 1.1.megabytes times, so the
  # extracted content starts out larger than the bound asserted below.
  let(:document_hash) { { details: { body: "a" * 1.1.megabytes } } }

  # The extracted content must never exceed one megabyte in bytes (not characters).
  it "truncates the content" do
    expect(extracted_content.bytesize).to be <= 1.megabyte
  end
end

describe "without any fields" do
let(:document_hash) do
{
Expand Down

0 comments on commit 9493b0f

Please sign in to comment.