diff --git a/velox/type/tz/TimeZoneDatabase.cpp b/velox/type/tz/TimeZoneDatabase.cpp index 1a31e7d9b1185..8a6b47cb1e244 100644 --- a/velox/type/tz/TimeZoneDatabase.cpp +++ b/velox/type/tz/TimeZoneDatabase.cpp @@ -30,12 +30,12 @@ namespace facebook::velox::tz { -const std::unordered_map& getTimeZoneDB() { - static auto* tzDB = new std::unordered_map([] { +const std::vector>& getTimeZoneEntries() { + static auto* tzDB = new std::vector>([] { // Work around clang compiler bug causing multi-hour compilation // with -fsanitize=fuzzer // https://github.com/llvm/llvm-project/issues/75666 - std::vector> entries = { + return std::vector>{ {0, "+00:00"}, {1, "-14:00"}, {2, "-13:59"}, @@ -2266,8 +2266,6 @@ const std::unordered_map& getTimeZoneDB() { {2232, "Europe/Kyiv"}, {2233, "America/Ciudad_Juarez"}, }; - return std::unordered_map( - entries.begin(), entries.end()); }()); return *tzDB; } diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp index 2a46240ba8927..eeb896738ef5c 100644 --- a/velox/type/tz/TimeZoneMap.cpp +++ b/velox/type/tz/TimeZoneMap.cpp @@ -26,23 +26,56 @@ namespace facebook::velox::tz { // Defined in TimeZoneDatabase.cpp -extern const std::unordered_map& getTimeZoneDB(); +extern const std::vector>& getTimeZoneEntries(); + +// TODO: The string will be moved to TimeZone in the next PR. +using TTimeZoneDatabase = std::vector>; +using TTimeZoneIndex = folly::F14FastMap; namespace { -folly::F14FastMap makeReverseMap( - const std::unordered_map& map) { - folly::F14FastMap reversed; - reversed.reserve(map.size() + 1); +// Flattens the input vector of pairs into a vector, assuming that the +// timezoneIDs are (mostly) sequential. Note that since they are "mostly" +// senquential, the vector can have holes. But it is still more efficient than +// looking up on a map. +TTimeZoneDatabase buildTimeZoneDatabase( + const std::vector>& dbInput) { + TTimeZoneDatabase tzDatabase; + tzDatabase.resize(dbInput.back().first + 1); + + for (const auto& entry : dbInput) { + tzDatabase[entry.first] = std::make_unique(entry.second); + } + return tzDatabase; +} + +const TTimeZoneDatabase& getTimeZoneDatabase() { + static TTimeZoneDatabase timeZoneDatabase = + buildTimeZoneDatabase(getTimeZoneEntries()); + return timeZoneDatabase; +} + +// Reverses the vector of pairs into a map key'ed by the timezone name for +// reverse look ups. +TTimeZoneIndex buildTimeZoneIndex(const TTimeZoneDatabase& tzDatabase) { + TTimeZoneIndex reversed; + reversed.reserve(tzDatabase.size() + 1); - for (const auto& entry : map) { - reversed.emplace( - boost::algorithm::to_lower_copy(entry.second), entry.first); + for (int16_t i = 0; i < tzDatabase.size(); ++i) { + if (tzDatabase[i] != nullptr) { + reversed.emplace(boost::algorithm::to_lower_copy(*tzDatabase[i]), i); + } } reversed.emplace("utc", 0); return reversed; } +const TTimeZoneIndex& getTimeZoneIndex() { + static TTimeZoneIndex timeZoneIndex = + buildTimeZoneIndex(getTimeZoneDatabase()); + return timeZoneIndex; +} + inline bool isDigit(char c) { return c >= '0' && c <= '9'; } @@ -111,28 +144,37 @@ std::string normalizeTimeZone(const std::string& originalZoneId) { } // namespace std::string getTimeZoneName(int64_t timeZoneID) { - const auto& tzDB = getTimeZoneDB(); - auto it = tzDB.find(timeZoneID); - VELOX_CHECK( - it != tzDB.end(), "Unable to resolve timeZoneID '{}'", timeZoneID); - return it->second; + const auto& timeZoneDatabase = getTimeZoneDatabase(); + + VELOX_CHECK_LT( + timeZoneID, + timeZoneDatabase.size(), + "Unable to resolve timeZoneID '{}'", + timeZoneID); + + // Check if timeZoneID is not one of the "holes". + VELOX_CHECK_NOT_NULL( + timeZoneDatabase[timeZoneID], + "Unable to resolve timeZoneID '{}'", + timeZoneID); + return *timeZoneDatabase[timeZoneID]; } int16_t getTimeZoneID(std::string_view timeZone, bool failOnError) { - static folly::F14FastMap nameToIdMap = - makeReverseMap(getTimeZoneDB()); + const auto& timeZoneIndex = getTimeZoneIndex(); + std::string timeZoneLowered; boost::algorithm::to_lower_copy( std::back_inserter(timeZoneLowered), timeZone); - auto it = nameToIdMap.find(timeZoneLowered); - if (it != nameToIdMap.end()) { + auto it = timeZoneIndex.find(timeZoneLowered); + if (it != timeZoneIndex.end()) { return it->second; } // If an exact match wasn't found, try to normalize the timezone name. - it = nameToIdMap.find(normalizeTimeZone(timeZoneLowered)); - if (it != nameToIdMap.end()) { + it = timeZoneIndex.find(normalizeTimeZone(timeZoneLowered)); + if (it != timeZoneIndex.end()) { return it->second; } if (failOnError) { diff --git a/velox/type/tz/gen_timezone_database.py b/velox/type/tz/gen_timezone_database.py index 78e84753f3c4c..68d53ef1ad9ea 100755 --- a/velox/type/tz/gen_timezone_database.py +++ b/velox/type/tz/gen_timezone_database.py @@ -52,16 +52,14 @@ namespace facebook::velox::util { -const std::unordered_map& getTimeZoneDB() { - static auto* tzDB = new std::unordered_map([] { +const std::vector>& getTimeZoneEntries() { + static auto* tzDB = new std::vector>([] { // Work around clang compiler bug causing multi-hour compilation // with -fsanitize=fuzzer // https://github.com/llvm/llvm-project/issues/75666 - std::vector> entries = { + return std::vector>{ $entries }; - return std::unordered_map( - entries.begin(), entries.end()); }()); return *tzDB; } diff --git a/velox/type/tz/tests/TimeZoneMapTest.cpp b/velox/type/tz/tests/TimeZoneMapTest.cpp index e6873763bbdfa..4e0f7818ec5d4 100644 --- a/velox/type/tz/tests/TimeZoneMapTest.cpp +++ b/velox/type/tz/tests/TimeZoneMapTest.cpp @@ -41,6 +41,7 @@ TEST(TimeZoneMapTest, getTimeZoneID) { EXPECT_EQ(0, getTimeZoneID("UTC")); EXPECT_EQ(0, getTimeZoneID("GMT")); EXPECT_EQ(0, getTimeZoneID("Z")); + EXPECT_EQ(0, getTimeZoneID("z")); EXPECT_EQ(0, getTimeZoneID("greenwich")); EXPECT_EQ(0, getTimeZoneID("ETC/GMT")); EXPECT_EQ(0, getTimeZoneID("ETC/GMT0"));