From e3e791a881d6fd0fa01a88ab5b53f57c3b70fa63 Mon Sep 17 00:00:00 2001 From: Pedro Eugenio Rocha Pedreira Date: Thu, 25 Jul 2024 17:24:35 -0700 Subject: [PATCH] Refactor internal time zone database (#10572) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10572 Refactoring the internal time zone database to prepare for the next PR. Moving map from ID to string to vector, to remove the map access for the hot path. Also re-organizing the code to allow them to reference each other. Reviewed By: mbasmanova Differential Revision: D60211961 fbshipit-source-id: b532b29f6b3755a357baeb5c9c68650a2f73ecea --- velox/type/tz/TimeZoneDatabase.cpp | 8 +-- velox/type/tz/TimeZoneMap.cpp | 80 +++++++++++++++++++------ velox/type/tz/gen_timezone_database.py | 8 +-- velox/type/tz/tests/TimeZoneMapTest.cpp | 1 + 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/velox/type/tz/TimeZoneDatabase.cpp b/velox/type/tz/TimeZoneDatabase.cpp index 1a31e7d9b118..8a6b47cb1e24 100644 --- a/velox/type/tz/TimeZoneDatabase.cpp +++ b/velox/type/tz/TimeZoneDatabase.cpp @@ -30,12 +30,12 @@ namespace facebook::velox::tz { -const std::unordered_map& getTimeZoneDB() { - static auto* tzDB = new std::unordered_map([] { +const std::vector>& getTimeZoneEntries() { + static auto* tzDB = new std::vector>([] { // Work around clang compiler bug causing multi-hour compilation // with -fsanitize=fuzzer // https://github.com/llvm/llvm-project/issues/75666 - std::vector> entries = { + return std::vector>{ {0, "+00:00"}, {1, "-14:00"}, {2, "-13:59"}, @@ -2266,8 +2266,6 @@ const std::unordered_map& getTimeZoneDB() { {2232, "Europe/Kyiv"}, {2233, "America/Ciudad_Juarez"}, }; - return std::unordered_map( - entries.begin(), entries.end()); }()); return *tzDB; } diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 2a46240ba892..eeb896738ef5 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -26,23 +26,56 @@ namespace facebook::velox::tz { // Defined in TimeZoneDatabase.cpp -extern const std::unordered_map& getTimeZoneDB(); +extern const std::vector>& getTimeZoneEntries(); + +// TODO: The string will be moved to TimeZone in the next PR. +using TTimeZoneDatabase = std::vector>; +using TTimeZoneIndex = folly::F14FastMap; namespace { -folly::F14FastMap makeReverseMap( - const std::unordered_map& map) { - folly::F14FastMap reversed; - reversed.reserve(map.size() + 1); +// Flattens the input vector of pairs into a vector, assuming that the +// timezoneIDs are (mostly) sequential. Note that since they are "mostly" +// senquential, the vector can have holes. But it is still more efficient than +// looking up on a map. +TTimeZoneDatabase buildTimeZoneDatabase( + const std::vector>& dbInput) { + TTimeZoneDatabase tzDatabase; + tzDatabase.resize(dbInput.back().first + 1); + + for (const auto& entry : dbInput) { + tzDatabase[entry.first] = std::make_unique(entry.second); + } + return tzDatabase; +} + +const TTimeZoneDatabase& getTimeZoneDatabase() { + static TTimeZoneDatabase timeZoneDatabase = + buildTimeZoneDatabase(getTimeZoneEntries()); + return timeZoneDatabase; +} + +// Reverses the vector of pairs into a map key'ed by the timezone name for +// reverse look ups. +TTimeZoneIndex buildTimeZoneIndex(const TTimeZoneDatabase& tzDatabase) { + TTimeZoneIndex reversed; + reversed.reserve(tzDatabase.size() + 1); - for (const auto& entry : map) { - reversed.emplace( - boost::algorithm::to_lower_copy(entry.second), entry.first); + for (int16_t i = 0; i < tzDatabase.size(); ++i) { + if (tzDatabase[i] != nullptr) { + reversed.emplace(boost::algorithm::to_lower_copy(*tzDatabase[i]), i); + } } reversed.emplace("utc", 0); return reversed; } +const TTimeZoneIndex& getTimeZoneIndex() { + static TTimeZoneIndex timeZoneIndex = + buildTimeZoneIndex(getTimeZoneDatabase()); + return timeZoneIndex; +} + inline bool isDigit(char c) { return c >= '0' && c <= '9'; } @@ -111,28 +144,37 @@ std::string normalizeTimeZone(const std::string& originalZoneId) { } // namespace std::string getTimeZoneName(int64_t timeZoneID) { - const auto& tzDB = getTimeZoneDB(); - auto it = tzDB.find(timeZoneID); - VELOX_CHECK( - it != tzDB.end(), "Unable to resolve timeZoneID '{}'", timeZoneID); - return it->second; + const auto& timeZoneDatabase = getTimeZoneDatabase(); + + VELOX_CHECK_LT( + timeZoneID, + timeZoneDatabase.size(), + "Unable to resolve timeZoneID '{}'", + timeZoneID); + + // Check if timeZoneID is not one of the "holes". + VELOX_CHECK_NOT_NULL( + timeZoneDatabase[timeZoneID], + "Unable to resolve timeZoneID '{}'", + timeZoneID); + return *timeZoneDatabase[timeZoneID]; } int16_t getTimeZoneID(std::string_view timeZone, bool failOnError) { - static folly::F14FastMap nameToIdMap = - makeReverseMap(getTimeZoneDB()); + const auto& timeZoneIndex = getTimeZoneIndex(); + std::string timeZoneLowered; boost::algorithm::to_lower_copy( std::back_inserter(timeZoneLowered), timeZone); - auto it = nameToIdMap.find(timeZoneLowered); - if (it != nameToIdMap.end()) { + auto it = timeZoneIndex.find(timeZoneLowered); + if (it != timeZoneIndex.end()) { return it->second; } // If an exact match wasn't found, try to normalize the timezone name. - it = nameToIdMap.find(normalizeTimeZone(timeZoneLowered)); - if (it != nameToIdMap.end()) { + it = timeZoneIndex.find(normalizeTimeZone(timeZoneLowered)); + if (it != timeZoneIndex.end()) { return it->second; } if (failOnError) { diff --git a/velox/type/tz/gen_timezone_database.py b/velox/type/tz/gen_timezone_database.py index 78e84753f3c4..68d53ef1ad9e 100755 --- a/velox/type/tz/gen_timezone_database.py +++ b/velox/type/tz/gen_timezone_database.py @@ -52,16 +52,14 @@ namespace facebook::velox::util { -const std::unordered_map& getTimeZoneDB() { - static auto* tzDB = new std::unordered_map([] { +const std::vector>& getTimeZoneEntries() { + static auto* tzDB = new std::vector>([] { // Work around clang compiler bug causing multi-hour compilation // with -fsanitize=fuzzer // https://github.com/llvm/llvm-project/issues/75666 - std::vector> entries = { + return std::vector>{ $entries }; - return std::unordered_map( - entries.begin(), entries.end()); }()); return *tzDB; } diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index e6873763bbdf..4e0f7818ec5d 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -41,6 +41,7 @@ TEST(TimeZoneMapTest, getTimeZoneID) { EXPECT_EQ(0, getTimeZoneID("UTC")); EXPECT_EQ(0, getTimeZoneID("GMT")); EXPECT_EQ(0, getTimeZoneID("Z")); + EXPECT_EQ(0, getTimeZoneID("z")); EXPECT_EQ(0, getTimeZoneID("greenwich")); EXPECT_EQ(0, getTimeZoneID("ETC/GMT")); EXPECT_EQ(0, getTimeZoneID("ETC/GMT0"));