Skip to content

Commit

Permalink
Add MessageTable.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 663916096
  • Loading branch information
protobuf-github-bot authored and copybara-github committed Aug 21, 2024
1 parent 5cbf13b commit 606a8ce
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/google/protobuf/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,7 @@ cc_library(
"arenaz_sampler.cc",
"extension_set.cc",
"generated_enum_util.cc",
"generated_message_table.cc",
"generated_message_tctable_lite.cc",
"generated_message_util.cc",
"implicit_weak_message.cc",
Expand Down
18 changes: 18 additions & 0 deletions src/google/protobuf/generated_message_table.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include "google/protobuf/generated_message_table.h"

namespace google {
namespace protobuf {
namespace internal {
namespace v2 {

// Shared table for messages that declare no fields: such messages can
// reference this single instance rather than emitting a table of their own.
// Every header value is zero and the table carries no field entries.
constexpr MessageTable<0> kEmptyMessageTable = {
    {
        /*has_bits_offset*/ 0,
        /*extension_offset*/ 0,
        /*field_count*/ 0,
        /*oneof_field_count*/ 0,
        /*split_field_count*/ 0,
        /*oneof_case_count*/ 0,
        /*aux_offset*/ 0,
    },
};

} // namespace v2
} // namespace internal
} // namespace protobuf
} // namespace google
191 changes: 191 additions & 0 deletions src/google/protobuf/generated_message_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
#define GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DECL_H__

#include <cstdint>
#include <cstdlib>
#include <limits>

#include "absl/base/attributes.h"
#include "absl/log/absl_check.h"
#include "absl/types/span.h"

namespace google {
namespace protobuf {
Expand Down Expand Up @@ -240,6 +243,194 @@ struct FieldEntry {

static_assert(sizeof(FieldEntry) == sizeof(uint64_t), "");

// Minimal fixed-size array wrapper used in place of std::array<T, N>. The
// N == 0 case is specialized separately into an empty type, which works more
// effectively with [[no_unique_address]] in C++20.
template <typename T, size_t N>
struct Array {
  // Backing storage for the N elements.
  T array[N];

  // Returns a mutable view over all N elements.
  absl::Span<T> value() { return absl::Span<T>(array, N); }

  // Returns a read-only view over all N elements.
  absl::Span<const T> value() const { return absl::Span<const T>(array, N); }
};

// Zero-length specialization: declares no storage member, so the type is
// empty, and value() always yields an empty span.
template <typename T>
struct Array<T, 0> {
  absl::Span<T> value() { return {}; }
  absl::Span<const T> value() const { return {}; }
};

// Table-driven serialization and ByteSizeLong have different interaction with
// tables compared to table-driven parsing. While the latter walks wire-format
// data and needs to cheaply find corresponding field entry, the former can
// afford to just walk "all" present fields per message. To achieve maximum
// efficiency, a dedicated table structure is used for serialization and
// ByteSizeLong.
//
// Since messages have different shape (#fields, field types, etc.), the message
// table must be generic enough to cover all types of messages. For example,
// --extensions
// --singular, optional, repeated, map, oneof fields
// --split fields (go/pdsplit)
//
// While rare, it also has to cover the following cases:
// --huge number of fields (requires 32bit has_bit_index)
// --huge field numbers (requires 32bit field_number)
// --huge message size (requires 32bit offset)
//
// Generic tables consume bigger space not just in memory but in data cache. To
// achieve minimum cache footprint in common cases, we rely on `AuxEntry` for
// fully descriptive entries while `FieldEntry` is large enough for the common
// cases. Extending this notion, some metadata about messages (MessageTableAux)
// are isolated and often dropped when not needed. The following describes the
// most generic table (MessageTable):
//
// template <size_t kNumFields, bool kHasAuxHdr, size_t kNumAux>
// struct MessageTable {
// MessageTableBase header;
// Array<FieldEntry, kNumFields> field_entries;
// MessageTableAuxImpl<kHasAuxHdr> aux_header;
// Array<AuxEntry, kNumAux> aux_entries;
// };
//
// Note that "field_entries" are laid out in the following way (following the
// way fields are laid out in messages):
//
// --repeated fields / singular / optional fields (non-split, non-oneof)
// --split fields
// --oneof fields
//
// If header.split_field_count > 0, the following needs to happen:
// --Get split struct address from aux_header.split_offset.
// --If the address is same as aux_header.default_split_instance, just skip.
// --Otherwise, go through split field entries to handle present fields.
//
// If header.oneof_field_count > 0, the following needs to happen:
// --Get oneof_case[] from aux_header.oneof_case_offset.
// --Read field number from oneof_case[0].
// --Find a corresponding FieldEntry with the field number.
// --Repeat if there are more oneof cases.
//
// Note that FieldEntry tries to fit all required information into 64 bits,
// which supports the following limits and should be large enough for most
// messages. Otherwise, it falls back to AuxEntry:
// --hasbit_index up to 256. (8bit)
// --sizeof(Message) up to 64 KiB (16bit offset)
// --field_number up to 2^16
//
// To minimize cache footprints of MessageTable, we rely on
// [[no_unique_address]] attributes in C++20. When the associated type is empty
// it results in zero added bytes, which avoids specializations.
//
// For example, most simple cases where no aux_header is needed, use
// MessageTable<num_fields> the size of which becomes
// sizeof(MessageTableBase) + num_fields x sizeof(FieldEntry).
//
// If a message has split, oneof, etc., needing `aux_header` but not
// necessarily `aux_entry`, use MessageTable<num_fields, true>.
//
// Auxiliary header of a MessageTable, selected by kHasAuxHdr. This primary
// template is the populated form; the <false> specialization is an empty type.
template <bool kHasAuxHdr>
struct MessageTableAuxImpl {
// Offset used to locate the message's oneof_case[] array.
uint32_t oneof_case_offset;
// Offset used to locate the message's split struct.
uint32_t split_offset;
// Address of the default split instance. Per the table description above, a
// split struct found at this address is skipped.
void* default_split_instance;
};

// Empty form used when a table needs no auxiliary header. It has no members so
// that, when declared with ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS (as MessageTable
// does), it is intended to add no bytes to the table.
template <>
struct MessageTableAuxImpl<false> {};

// Wrapper type for non-empty MessageTableAuxImpl.
using MessageTableAux = MessageTableAuxImpl<true>;

// Fully descriptive entry that FieldEntry falls back to when a field's values
// do not fit FieldEntry's compact 64-bit encoding. Each value is stored at
// full 32-bit width.
struct AuxEntry {
uint32_t hasbit_index;  // 32-bit has-bit index (for huge numbers of fields).
uint32_t field_number;  // 32-bit field number (for huge field numbers).
uint32_t offset;  // 32-bit offset (for huge messages); presumably the field's
                  // byte offset within the message -- TODO confirm.
};

// Fixed-size header stored at the front of every MessageTable. The other table
// sections (field entries, aux header, aux entries) are not members of this
// struct; the accessors below reach them through address arithmetic relative
// to `this`, so an instance is only meaningful when embedded in a MessageTable.
struct MessageTableBase {
  constexpr MessageTableBase(uint16_t has_bits_offset,
                             uint16_t extension_offset, uint16_t field_count,
                             uint16_t oneof_field_count,
                             uint16_t split_field_count,
                             uint16_t oneof_case_count, uint32_t aux_offset)
      : has_bits_offset(has_bits_offset),
        extension_offset(extension_offset),
        field_count(field_count),
        oneof_field_count(oneof_field_count),
        split_field_count(split_field_count),
        oneof_case_count(oneof_case_count),
        aux_offset(aux_offset) {}

  // Field entries start right behind this header with no padding, so their
  // position is known statically. Debug builds only verify that the table
  // declares at least one field; `idx` itself is not range-checked.
  FieldEntry field_entry(size_t idx) const {
    ABSL_DCHECK_NE(field_count + oneof_field_count + split_field_count, 0);
    const FieldEntry* entries = reinterpret_cast<const FieldEntry*>(this + 1);
    return entries[idx];
  }
  FieldEntry& field_entry(size_t idx) {
    ABSL_DCHECK_NE(field_count + oneof_field_count + split_field_count, 0);
    FieldEntry* entries = reinterpret_cast<FieldEntry*>(this + 1);
    return entries[idx];
  }

  // The aux header follows the variable-sized field entries, so it is found
  // through the cached `aux_offset` (measured from the start of this header).
  // Debug builds require a non-zero `aux_offset`.
  const MessageTableAux* aux_header() const {
    ABSL_DCHECK_NE(aux_offset, 0u);
    const uintptr_t address = PtrAt(this, aux_offset);
    return reinterpret_cast<const MessageTableAux*>(address);
  }
  MessageTableAux* aux_header() {
    ABSL_DCHECK_NE(aux_offset, 0u);
    const uintptr_t address = PtrAt(this, aux_offset);
    return reinterpret_cast<MessageTableAux*>(address);
  }

  // Aux entries sit directly behind the aux header with no padding in between.
  // Rather than caching a second offset, their start is derived from
  // `aux_offset` plus the statically known size of the aux header.
  const AuxEntry* aux_entry(size_t idx) const {
    ABSL_DCHECK_NE(aux_offset, 0u);
    const uintptr_t first = PtrAt(this, aux_offset + sizeof(MessageTableAux));
    return reinterpret_cast<const AuxEntry*>(first) + idx;
  }
  AuxEntry* aux_entry(size_t idx) {
    ABSL_DCHECK_NE(aux_offset, 0u);
    const uintptr_t first = PtrAt(this, aux_offset + sizeof(MessageTableAux));
    return reinterpret_cast<AuxEntry*>(first) + idx;
  }

  // Returns the integer address that lies `offset` bytes past `ptr`.
  static uintptr_t PtrAt(const void* ptr, size_t offset) {
    const uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
    return base + offset;
  }

  uint16_t has_bits_offset;
  uint16_t extension_offset;

  uint16_t field_count;
  uint16_t oneof_field_count;
  uint16_t split_field_count;

  // Kept here rather than in MessageTableAux so that it fills bytes that
  // would otherwise be padding.
  uint16_t oneof_case_count;
  // Offset from the start of this header to the aux header.
  // TODO: consider repurposing `aux_offset` as it's cheap to
  // calculate the offset. (add + shift)
  uint32_t aux_offset;
};

static_assert(sizeof(MessageTableBase) == 16,
              "Must be kept compact for minimum cache footprint.");

// Concrete table layout, in order: the fixed header, kNumFields field entries,
// the aux header (non-empty only when kHasAuxHdr is true), and kNumAux aux
// entries. The three trailing sections are marked
// ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS so that an empty section is intended to add
// no bytes. MessageTableBase's accessors assume the sections are laid out back
// to back in exactly this order, so members must not be reordered.
template <size_t kNumFields, bool kHasAuxHdr = false, size_t kNumAux = 0>
struct MessageTable {
MessageTableBase header;
ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS Array<FieldEntry, kNumFields> field_entries;
ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS MessageTableAuxImpl<kHasAuxHdr> aux_header;
ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS Array<AuxEntry, kNumAux> aux_entries;
};

// A pre-built table for empty messages without any fields.
extern const MessageTable<0> kEmptyMessageTable;

} // namespace v2
} // namespace internal
} // namespace protobuf
Expand Down
49 changes: 49 additions & 0 deletions src/google/protobuf/generated_message_table_gen_test.cc
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#include "google/protobuf/generated_message_table_gen.h"

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/algorithm/container.h"
#include "absl/log/absl_check.h"
Expand Down Expand Up @@ -175,6 +177,53 @@ INSTANTIATE_TEST_SUITE_P(
return name;
});

// Verifies that, for tables without an aux header, `field_entries` begins
// exactly sizeof(MessageTableBase) bytes into the table, i.e. no padding
// follows `header`. Checked for one-entry and two-entry tables.
TEST(MessageTableTest, AssertNoPaddingSimpleMessageTable) {
// Between header and field_entries.
EXPECT_EQ(offsetof(MessageTable<1>, field_entries), sizeof(MessageTableBase));
EXPECT_EQ(offsetof(MessageTable<2>, field_entries), sizeof(MessageTableBase));
}

// Expects that member FIELD2 of TYPE starts exactly OFFSET bytes after member
// FIELD1 starts. When OFFSET is the size of FIELD1, this means the two members
// are adjacent with no padding in between. OFFSET is parenthesized in the
// expansion so that any expression (e.g. a conditional) can be passed without
// being re-associated with the preceding `+`.
#define EXPECT_BACK_TO_BACK(TYPE, FIELD1, OFFSET, FIELD2) \
  EXPECT_EQ((offsetof(TYPE, FIELD1) + (OFFSET)), offsetof(TYPE, FIELD2))

// Verifies the layout of tables that have an aux header but no aux entries:
// `field_entries` must start right after `header`, and `aux_header` must start
// right after the last field entry. Checked for one and two field entries.
TEST(MessageTableTest, AssertNoPaddingMessageTableWithoutAuxEntry) {
// header, field_entries, aux_header should be back to back.
// The table types are aliased because a comma inside a template argument list
// would be split into separate macro arguments.
using table1_t = MessageTable<1, true>;
EXPECT_BACK_TO_BACK(table1_t, header, sizeof(MessageTableBase),
field_entries);
EXPECT_BACK_TO_BACK(table1_t, field_entries, sizeof(FieldEntry), aux_header);

using table2_t = MessageTable<2, true>;
EXPECT_BACK_TO_BACK(table2_t, header, sizeof(MessageTableBase),
field_entries);
EXPECT_BACK_TO_BACK(table2_t, field_entries, 2 * sizeof(FieldEntry),
aux_header);
}

// Verifies the layout of fully populated tables (aux header plus aux entries):
// each section must begin exactly where the previous one ends. Checked for a
// table with one field/one aux entry and one with two fields/two aux entries.
TEST(MessageTableTest, AssertNoPaddingMessageTable) {
// header, field_entries, aux_header, aux_entries should be back to back.
// offsetof macro doesn't work with T<a, b>. Alias the type here.
using table1_t = MessageTable<1, true, 1>;
EXPECT_BACK_TO_BACK(table1_t, header, sizeof(MessageTableBase),
field_entries);
EXPECT_BACK_TO_BACK(table1_t, field_entries, sizeof(FieldEntry), aux_header);
// aux_entries must follow aux_header directly; MessageTableBase::aux_entry()
// relies on this when it adds sizeof(MessageTableAux) to aux_offset.
EXPECT_BACK_TO_BACK(table1_t, aux_header, sizeof(MessageTableAux),
aux_entries);

using table2_t = MessageTable<2, true, 2>;
EXPECT_BACK_TO_BACK(table2_t, header, sizeof(MessageTableBase),
field_entries);
EXPECT_BACK_TO_BACK(table2_t, field_entries, 2 * sizeof(FieldEntry),
aux_header);
EXPECT_BACK_TO_BACK(table2_t, aux_header, sizeof(MessageTableAux),
aux_entries);
}

#undef EXPECT_BACK_TO_BACK


} // namespace
} // namespace v2
} // namespace internal
Expand Down

0 comments on commit 606a8ce

Please sign in to comment.