Skip to content

Commit

Permalink
Refactor internal time zone database (#10572)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #10572

Refactoring the internal time zone database to prepare for the next
PR. Replacing the ID-to-string map with a vector, to remove the map
lookup from the hot path. Also re-organizing the code so that these
structures can reference each other.

Reviewed By: mbasmanova

Differential Revision: D60211961
  • Loading branch information
pedroerp authored and facebook-github-bot committed Jul 25, 2024
1 parent 0fe4db8 commit b57bc0f
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 29 deletions.
8 changes: 3 additions & 5 deletions velox/type/tz/TimeZoneDatabase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@

namespace facebook::velox::tz {

const std::unordered_map<int64_t, std::string>& getTimeZoneDB() {
static auto* tzDB = new std::unordered_map<int64_t, std::string>([] {
const std::vector<std::pair<int16_t, std::string>>& getTimeZoneEntries() {
static auto* tzDB = new std::vector<std::pair<int16_t, std::string>>([] {
// Work around clang compiler bug causing multi-hour compilation
// with -fsanitize=fuzzer
// https://github.com/llvm/llvm-project/issues/75666
std::vector<std::pair<int64_t, std::string>> entries = {
return std::vector<std::pair<int16_t, std::string>>{
{0, "+00:00"},
{1, "-14:00"},
{2, "-13:59"},
Expand Down Expand Up @@ -2266,8 +2266,6 @@ const std::unordered_map<int64_t, std::string>& getTimeZoneDB() {
{2232, "Europe/Kyiv"},
{2233, "America/Ciudad_Juarez"},
};
return std::unordered_map<int64_t, std::string>(
entries.begin(), entries.end());
}());
return *tzDB;
}
Expand Down
80 changes: 61 additions & 19 deletions velox/type/tz/TimeZoneMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,56 @@
namespace facebook::velox::tz {

// Defined in TimeZoneDatabase.cpp
extern const std::unordered_map<int64_t, std::string>& getTimeZoneDB();
extern const std::vector<std::pair<int16_t, std::string>>& getTimeZoneEntries();

// TODO: The string will be moved to TimeZone in the next PR.
using TTimeZoneDatabase = std::vector<std::unique_ptr<std::string>>;
using TTimeZoneIndex = folly::F14FastMap<std::string, int16_t>;

namespace {

folly::F14FastMap<std::string, int64_t> makeReverseMap(
const std::unordered_map<int64_t, std::string>& map) {
folly::F14FastMap<std::string, int64_t> reversed;
reversed.reserve(map.size() + 1);
// Flattens the input vector of (timezoneID, name) pairs into a vector indexed
// by timezoneID, assuming that the timezoneIDs are (mostly) sequential. Note
// that since they are only "mostly" sequential, the vector can have holes
// (null entries). But it is still more efficient than looking up on a map.
//
// The result is sized by the largest timezoneID present in `dbInput`, not by
// the ID of its last entry, so the input does not have to be sorted by ID. An
// empty input produces an empty database. IDs are expected to be non-negative.
TTimeZoneDatabase buildTimeZoneDatabase(
    const std::vector<std::pair<int16_t, std::string>>& dbInput) {
  TTimeZoneDatabase tzDatabase;
  if (dbInput.empty()) {
    // front()/back() on an empty vector is undefined behavior.
    return tzDatabase;
  }

  // Find the largest ID so that every entry below has a valid slot.
  int16_t maxId = dbInput.front().first;
  for (const auto& entry : dbInput) {
    if (entry.first > maxId) {
      maxId = entry.first;
    }
  }
  tzDatabase.resize(maxId + 1);

  // IDs that never appear in the input keep their default (null) slot.
  for (const auto& entry : dbInput) {
    tzDatabase[entry.first] = std::make_unique<std::string>(entry.second);
  }
  return tzDatabase;
}

// Returns the shared ID-indexed time zone database. It is built from
// getTimeZoneEntries() the first time this function is called; every later
// call returns a reference to that same instance.
const TTimeZoneDatabase& getTimeZoneDatabase() {
  static TTimeZoneDatabase tzDatabase(
      buildTimeZoneDatabase(getTimeZoneEntries()));
  return tzDatabase;
}

// Inverts the timezone database into a map keyed by the lower-cased timezone
// name, for reverse (name to timezoneID) lookups.
TTimeZoneIndex buildTimeZoneIndex(const TTimeZoneDatabase& tzDatabase) {
TTimeZoneIndex reversed;
reversed.reserve(tzDatabase.size() + 1);

for (const auto& entry : map) {
reversed.emplace(
boost::algorithm::to_lower_copy(entry.second), entry.first);
for (int16_t i = 0; i < tzDatabase.size(); ++i) {
if (tzDatabase[i] != nullptr) {
reversed.emplace(boost::algorithm::to_lower_copy(*tzDatabase[i]), i);
}
}
reversed.emplace("utc", 0);
return reversed;
}

// Returns the shared name -> timezoneID index. It is derived from
// getTimeZoneDatabase() the first time this function is called; every later
// call returns a reference to that same instance.
const TTimeZoneIndex& getTimeZoneIndex() {
  static TTimeZoneIndex tzIndex(buildTimeZoneIndex(getTimeZoneDatabase()));
  return tzIndex;
}

// Returns true iff `c` is one of the ASCII decimal digits '0' through '9'.
inline bool isDigit(char c) {
  const bool belowZero = c < '0';
  const bool aboveNine = c > '9';
  return !(belowZero || aboveNine);
}
Expand Down Expand Up @@ -111,28 +144,37 @@ std::string normalizeTimeZone(const std::string& originalZoneId) {
} // namespace

std::string getTimeZoneName(int64_t timeZoneID) {
const auto& tzDB = getTimeZoneDB();
auto it = tzDB.find(timeZoneID);
VELOX_CHECK(
it != tzDB.end(), "Unable to resolve timeZoneID '{}'", timeZoneID);
return it->second;
const auto& timeZoneDatabase = getTimeZoneDatabase();

VELOX_CHECK_LT(
timeZoneID,
timeZoneDatabase.size(),
"Unable to resolve timeZoneID '{}'",
timeZoneID);

// Check if timeZoneID is not one of the "holes".
VELOX_CHECK_NOT_NULL(
timeZoneDatabase[timeZoneID],
"Unable to resolve timeZoneID '{}'",
timeZoneID);
return *timeZoneDatabase[timeZoneID];
}

int16_t getTimeZoneID(std::string_view timeZone, bool failOnError) {
static folly::F14FastMap<std::string, int64_t> nameToIdMap =
makeReverseMap(getTimeZoneDB());
const auto& timeZoneIndex = getTimeZoneIndex();

std::string timeZoneLowered;
boost::algorithm::to_lower_copy(
std::back_inserter(timeZoneLowered), timeZone);

auto it = nameToIdMap.find(timeZoneLowered);
if (it != nameToIdMap.end()) {
auto it = timeZoneIndex.find(timeZoneLowered);
if (it != timeZoneIndex.end()) {
return it->second;
}

// If an exact match wasn't found, try to normalize the timezone name.
it = nameToIdMap.find(normalizeTimeZone(timeZoneLowered));
if (it != nameToIdMap.end()) {
it = timeZoneIndex.find(normalizeTimeZone(timeZoneLowered));
if (it != timeZoneIndex.end()) {
return it->second;
}
if (failOnError) {
Expand Down
8 changes: 3 additions & 5 deletions velox/type/tz/gen_timezone_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,14 @@
namespace facebook::velox::util {
const std::unordered_map<int64_t, std::string>& getTimeZoneDB() {
static auto* tzDB = new std::unordered_map<int64_t, std::string>([] {
const std::vector<std::pair<int16_t, std::string>>& getTimeZoneEntries() {
static auto* tzDB = new std::vector<std::pair<int16_t, std::string>>([] {
// Work around clang compiler bug causing multi-hour compilation
// with -fsanitize=fuzzer
// https://github.com/llvm/llvm-project/issues/75666
std::vector<std::pair<int64_t, std::string>> entries = {
return std::vector<std::pair<int16_t, std::string>>{
$entries
};
return std::unordered_map<int64_t, std::string>(
entries.begin(), entries.end());
}());
return *tzDB;
}
Expand Down
1 change: 1 addition & 0 deletions velox/type/tz/tests/TimeZoneMapTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ TEST(TimeZoneMapTest, getTimeZoneID) {
EXPECT_EQ(0, getTimeZoneID("UTC"));
EXPECT_EQ(0, getTimeZoneID("GMT"));
EXPECT_EQ(0, getTimeZoneID("Z"));
EXPECT_EQ(0, getTimeZoneID("z"));
EXPECT_EQ(0, getTimeZoneID("greenwich"));
EXPECT_EQ(0, getTimeZoneID("ETC/GMT"));
EXPECT_EQ(0, getTimeZoneID("ETC/GMT0"));
Expand Down

0 comments on commit b57bc0f

Please sign in to comment.