diff --git a/src/google/protobuf/BUILD.bazel b/src/google/protobuf/BUILD.bazel index 239e31890f35..165061e7d295 100644 --- a/src/google/protobuf/BUILD.bazel +++ b/src/google/protobuf/BUILD.bazel @@ -488,6 +488,7 @@ cc_library( "arenaz_sampler.cc", "extension_set.cc", "generated_enum_util.cc", + "generated_message_table.cc", "generated_message_tctable_lite.cc", "generated_message_util.cc", "implicit_weak_message.cc", diff --git a/src/google/protobuf/generated_message_table.cc b/src/google/protobuf/generated_message_table.cc new file mode 100644 index 000000000000..90212ebcc961 --- /dev/null +++ b/src/google/protobuf/generated_message_table.cc @@ -0,0 +1,18 @@ +#include "google/protobuf/generated_message_table.h" + +namespace google { +namespace protobuf { +namespace internal { +namespace v2 { + +// Messages without any fields can just point to this special table (header +// only, all counts and offsets zero) instead of creating their own. +constexpr MessageTable<0> kEmptyMessageTable = { + {/*has_bits_offset*/ 0, /*extension_offset*/ 0, /*field_count*/ 0, + /*oneof_field_count*/ 0, /*split_field_count*/ 0, /*oneof_case_count*/ 0, + /*aux_offset*/ 0}}; + +} // namespace v2 +} // namespace internal +} // namespace protobuf +} // namespace google diff --git a/src/google/protobuf/generated_message_table.h b/src/google/protobuf/generated_message_table.h index 5a0a38014ca7..27881641d8ea 100644 --- a/src/google/protobuf/generated_message_table.h +++ b/src/google/protobuf/generated_message_table.h @@ -2,9 +2,12 @@ #define GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DECL_H__ #include +#include #include +#include "absl/base/attributes.h" #include "absl/log/absl_check.h" +#include "absl/types/span.h" namespace google { namespace protobuf { @@ -240,6 +243,194 @@ struct FieldEntry { static_assert(sizeof(FieldEntry) == sizeof(uint64_t), ""); +// Used instead of std::array to special-case when N==0 to create an empty +// type. It works more effectively with [[no_unique_address]] in C++20. 
+template +struct Array { + T array[N]; + + absl::Span value() { return absl::MakeSpan(array); } + absl::Span value() const { return absl::MakeSpan(array); } +}; + +template +struct Array { + absl::Span value() { return absl::Span(); } + absl::Span value() const { return absl::Span(); } +}; + +// Table-driven serialization and ByteSizeLong interact with tables differently +// from table-driven parsing. While the latter walks wire-format +// data and needs to cheaply find the corresponding field entry, the former can +// afford to just walk "all" present fields per message. To achieve maximum +// efficiency, a dedicated table structure is used for serialization and +// ByteSizeLong. +// +// Since messages have different shapes (#fields, field types, etc.), the message +// table must be generic enough to cover all types of messages. For example, +// --extensions +// --singular, optional, repeated, map, oneof fields +// --split fields (go/pdsplit) +// +// While rare, it also has to cover the following cases: +// --huge number of fields (requires 32bit has_bit_index) +// --huge field numbers (requires 32bit field_number) +// --huge message size (requires 32bit offset) +// +// Generic tables consume more space not just in memory but in data cache. To +// achieve minimum cache footprint in common cases, we rely on `AuxEntry` for +// fully descriptive entries while `FieldEntry` is large enough for the common +// cases. Extending this notion, some metadata about messages (MessageTableAux) +// is isolated and often dropped when not needed. 
The following describes the +// most generic table (MessageTable): +// +// template <size_t kFieldCount, bool kHasAuxHeader, size_t kAuxEntryCount> +// struct MessageTable { +// MessageTableBase header; +// Array<FieldEntry, kFieldCount> field_entries; +// MessageTableAuxImpl<kHasAuxHeader> aux_header; +// Array<AuxEntry, kAuxEntryCount> aux_entries; +// }; +// +// Note that "field_entries" are laid out in the following way (following the +// way fields are laid out in messages): +// +// --repeated fields / singular / optional fields (non-split, non-oneof) +// --split fields +// --oneof fields +// +// If header.split_field_count > 0, the following needs to happen: +// --Get split struct address from aux_header.split_offset. +// --If the address is the same as aux_header.default_split_instance, just skip. +// --Otherwise, go through split field entries to handle present fields. +// +// If header.oneof_field_count > 0, the following needs to happen: +// --Get oneof_case[] from aux_header.oneof_case_offset. +// --Read field number from oneof_case[0]. +// --Find a corresponding FieldEntry with the field number. +// --Repeat if there are more oneof cases. +// +// Note that FieldEntry tries to fit all required information into 64bit that +// can support the following, which should be large enough for most messages. +// Otherwise, it falls back to AuxEntry: +// --hasbit_index up to 256. (8bit) +// --sizeof(Message) up to 64 KiB (16bit offset) +// --field_number up to 2^16 +// +// To minimize cache footprints of MessageTable, we rely on +// [[no_unique_address]] attributes in C++20. When the associated type is empty +// it results in zero added bytes, which avoids specializations. +// +// For example, most simple cases where no aux_header is needed, use +// MessageTable<num_fields>, the size of which becomes +// sizeof(MessageTableBase) + num_fields x sizeof(FieldEntry). +// +// If a message has split, oneof, etc., needing `aux_header` but not +// necessarily `aux_entry`, use MessageTable<num_fields, true>. 
+template +struct MessageTableAuxImpl { + uint32_t oneof_case_offset; + uint32_t split_offset; + void* default_split_instance; +}; + +template <> +struct MessageTableAuxImpl {}; + +// Wrapper type for non-empty MessageTableAuxImpl. +using MessageTableAux = MessageTableAuxImpl; + +struct AuxEntry { + uint32_t hasbit_index; + uint32_t field_number; + uint32_t offset; +}; + +struct MessageTableBase { + constexpr MessageTableBase(uint16_t has_bits_offset, + uint16_t extension_offset, uint16_t field_count, + uint16_t oneof_field_count, + uint16_t split_field_count, + uint16_t oneof_case_count, uint32_t aux_offset) + : has_bits_offset(has_bits_offset), + extension_offset(extension_offset), + field_count(field_count), + oneof_field_count(oneof_field_count), + split_field_count(split_field_count), + oneof_case_count(oneof_case_count), + aux_offset(aux_offset) {} + + // "field_entry" is immediately after `MessageTableBase` without padding bytes, + // so its offset is statically known. + FieldEntry field_entry(size_t idx) const { + ABSL_DCHECK_NE(field_count + oneof_field_count + split_field_count, 0); + return *(reinterpret_cast(this + 1) + idx); + } + FieldEntry& field_entry(size_t idx) { + ABSL_DCHECK_NE(field_count + oneof_field_count + split_field_count, 0); + return *(reinterpret_cast(this + 1) + idx); + } + + // "aux_header" is after the field entries, whose total size is variable. Use + // the cached "aux_offset" (bytes from `this`) to locate it. + const MessageTableAux* aux_header() const { + ABSL_DCHECK_NE(aux_offset, 0u); + return reinterpret_cast(PtrAt(this, aux_offset)); + } + MessageTableAux* aux_header() { + ABSL_DCHECK_NE(aux_offset, 0u); + return reinterpret_cast(PtrAt(this, aux_offset)); + } + + // "aux_entry" is after "aux_header", whose size is statically known. Instead + // of caching a second offset, we add sizeof(MessageTableAux) to "aux_offset". + // This is acceptable because there are no padding bytes between the two. 
+ const AuxEntry* aux_entry(size_t idx) const { + ABSL_DCHECK_NE(aux_offset, 0u); + return reinterpret_cast( + PtrAt(this, aux_offset + sizeof(MessageTableAux))) + + idx; + } + AuxEntry* aux_entry(size_t idx) { + ABSL_DCHECK_NE(aux_offset, 0u); + return reinterpret_cast( + PtrAt(this, aux_offset + sizeof(MessageTableAux))) + + idx; + } + + static uintptr_t PtrAt(const void* ptr, size_t offset) { + return reinterpret_cast(ptr) + offset; + } + + uint16_t has_bits_offset; + uint16_t extension_offset; + + uint16_t field_count; + uint16_t oneof_field_count; + uint16_t split_field_count; + + // Could've been moved to MessageTableAux, but keeping it here makes good use + // of otherwise wasted padding bytes. + uint16_t oneof_case_count; + // TODO: consider repurposing `aux_offset` as it's cheap to + // calculate the offset. (add + shift) + uint32_t aux_offset; +}; + +static_assert(sizeof(MessageTableBase) == 16, + "Must be kept compact for minimum cache footprint."); + +template +struct MessageTable { + MessageTableBase header; + ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS Array field_entries; + ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS MessageTableAuxImpl aux_header; + ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS Array aux_entries; +}; + +// A pre-built table for empty messages without any fields. 
+extern const MessageTable<0> kEmptyMessageTable; + } // namespace v2 } // namespace internal } // namespace protobuf diff --git a/src/google/protobuf/generated_message_table_gen_test.cc b/src/google/protobuf/generated_message_table_gen_test.cc index 604fb64eab26..ee440230ba55 100644 --- a/src/google/protobuf/generated_message_table_gen_test.cc +++ b/src/google/protobuf/generated_message_table_gen_test.cc @@ -1,9 +1,11 @@ #include "google/protobuf/generated_message_table_gen.h" #include +#include #include #include +#include #include #include "absl/algorithm/container.h" #include "absl/log/absl_check.h" @@ -175,6 +177,53 @@ INSTANTIATE_TEST_SUITE_P( return name; }); +TEST(MessageTableTest, AssertNoPaddingSimpleMessageTable) { + // Between header and field_entries. + EXPECT_EQ(offsetof(MessageTable<1>, field_entries), sizeof(MessageTableBase)); + EXPECT_EQ(offsetof(MessageTable<2>, field_entries), sizeof(MessageTableBase)); +} + +// Expects FIELD2 to start exactly OFFSET bytes after FIELD1 in TYPE, i.e. the +// two fields are adjacent without padding. +#define EXPECT_BACK_TO_BACK(TYPE, FIELD1, OFFSET, FIELD2) \ + EXPECT_EQ((offsetof(TYPE, FIELD1) + OFFSET), offsetof(TYPE, FIELD2)) + +TEST(MessageTableTest, AssertNoPaddingMessageTableWithoutAuxEntry) { + // header, field_entries, aux_header should be back to back. + using table1_t = MessageTable<1, true>; + EXPECT_BACK_TO_BACK(table1_t, header, sizeof(MessageTableBase), + field_entries); + EXPECT_BACK_TO_BACK(table1_t, field_entries, sizeof(FieldEntry), aux_header); + + using table2_t = MessageTable<2, true>; + EXPECT_BACK_TO_BACK(table2_t, header, sizeof(MessageTableBase), + field_entries); + EXPECT_BACK_TO_BACK(table2_t, field_entries, 2 * sizeof(FieldEntry), + aux_header); +} + +TEST(MessageTableTest, AssertNoPaddingMessageTable) { + // header, field_entries, aux_header, aux_entries should be back to back. + // offsetof is a macro, so a type with commas can't be passed. Alias it here. 
+ using table1_t = MessageTable<1, true, 1>; + EXPECT_BACK_TO_BACK(table1_t, header, sizeof(MessageTableBase), + field_entries); + EXPECT_BACK_TO_BACK(table1_t, field_entries, sizeof(FieldEntry), aux_header); + EXPECT_BACK_TO_BACK(table1_t, aux_header, sizeof(MessageTableAux), + aux_entries); + + using table2_t = MessageTable<2, true, 2>; + EXPECT_BACK_TO_BACK(table2_t, header, sizeof(MessageTableBase), + field_entries); + EXPECT_BACK_TO_BACK(table2_t, field_entries, 2 * sizeof(FieldEntry), + aux_header); + EXPECT_BACK_TO_BACK(table2_t, aux_header, sizeof(MessageTableAux), + aux_entries); +} + +#undef EXPECT_BACK_TO_BACK + + } // namespace } // namespace v2 } // namespace internal