Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor internal time zone database #10572

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions velox/type/tz/TimeZoneDatabase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@

namespace facebook::velox::tz {

const std::unordered_map<int64_t, std::string>& getTimeZoneDB() {
static auto* tzDB = new std::unordered_map<int64_t, std::string>([] {
const std::vector<std::pair<int16_t, std::string>>& getTimeZoneEntries() {
static auto* tzDB = new std::vector<std::pair<int16_t, std::string>>([] {
// Work around clang compiler bug causing multi-hour compilation
// with -fsanitize=fuzzer
// https://github.com/llvm/llvm-project/issues/75666
std::vector<std::pair<int64_t, std::string>> entries = {
return std::vector<std::pair<int16_t, std::string>>{
{0, "+00:00"},
{1, "-14:00"},
{2, "-13:59"},
Expand Down Expand Up @@ -2266,8 +2266,6 @@ const std::unordered_map<int64_t, std::string>& getTimeZoneDB() {
{2232, "Europe/Kyiv"},
{2233, "America/Ciudad_Juarez"},
};
return std::unordered_map<int64_t, std::string>(
entries.begin(), entries.end());
}());
return *tzDB;
}
Expand Down
80 changes: 61 additions & 19 deletions velox/type/tz/TimeZoneMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,56 @@
namespace facebook::velox::tz {

// Defined in TimeZoneDatabase.cpp
extern const std::unordered_map<int64_t, std::string>& getTimeZoneDB();
extern const std::vector<std::pair<int16_t, std::string>>& getTimeZoneEntries();

// TODO: The string will be moved to TimeZone in the next PR.
using TTimeZoneDatabase = std::vector<std::unique_ptr<std::string>>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you use unique_pr<string>?

Copy link
Contributor Author

@pedroerp pedroerp Jul 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the next PR I'll move this to unique_ptr<TimeZone>, but I worried it would make this PR too large to review, so this seemed like a good compromise. For now unique_ptr<string> because I need to represent that some entries are empty (nullptr).

using TTimeZoneIndex = folly::F14FastMap<std::string, int16_t>;

namespace {

folly::F14FastMap<std::string, int64_t> makeReverseMap(
const std::unordered_map<int64_t, std::string>& map) {
folly::F14FastMap<std::string, int64_t> reversed;
reversed.reserve(map.size() + 1);
// Flattens the input vector of pairs into a vector, assuming that the
// timezoneIDs are (mostly) sequential. Note that since they are "mostly"
// senquential, the vector can have holes. But it is still more efficient than
// looking up on a map.
TTimeZoneDatabase buildTimeZoneDatabase(
const std::vector<std::pair<int16_t, std::string>>& dbInput) {
TTimeZoneDatabase tzDatabase;
tzDatabase.resize(dbInput.back().first + 1);

for (const auto& entry : dbInput) {
tzDatabase[entry.first] = std::make_unique<std::string>(entry.second);
}
return tzDatabase;
}

const TTimeZoneDatabase& getTimeZoneDatabase() {
static TTimeZoneDatabase timeZoneDatabase =
buildTimeZoneDatabase(getTimeZoneEntries());
return timeZoneDatabase;
}

// Reverses the vector of pairs into a map key'ed by the timezone name for
// reverse look ups.
TTimeZoneIndex buildTimeZoneIndex(const TTimeZoneDatabase& tzDatabase) {
TTimeZoneIndex reversed;
reversed.reserve(tzDatabase.size() + 1);

for (const auto& entry : map) {
reversed.emplace(
boost::algorithm::to_lower_copy(entry.second), entry.first);
for (int16_t i = 0; i < tzDatabase.size(); ++i) {
if (tzDatabase[i] != nullptr) {
reversed.emplace(boost::algorithm::to_lower_copy(*tzDatabase[i]), i);
}
}
reversed.emplace("utc", 0);
return reversed;
}

const TTimeZoneIndex& getTimeZoneIndex() {
static TTimeZoneIndex timeZoneIndex =
buildTimeZoneIndex(getTimeZoneDatabase());
return timeZoneIndex;
}

inline bool isDigit(char c) {
return c >= '0' && c <= '9';
}
Expand Down Expand Up @@ -111,28 +144,37 @@ std::string normalizeTimeZone(const std::string& originalZoneId) {
} // namespace

std::string getTimeZoneName(int64_t timeZoneID) {
const auto& tzDB = getTimeZoneDB();
auto it = tzDB.find(timeZoneID);
VELOX_CHECK(
it != tzDB.end(), "Unable to resolve timeZoneID '{}'", timeZoneID);
return it->second;
const auto& timeZoneDatabase = getTimeZoneDatabase();

VELOX_CHECK_LT(
timeZoneID,
timeZoneDatabase.size(),
"Unable to resolve timeZoneID '{}'",
timeZoneID);

// Check if timeZoneID is not one of the "holes".
VELOX_CHECK_NOT_NULL(
timeZoneDatabase[timeZoneID],
"Unable to resolve timeZoneID '{}'",
timeZoneID);
return *timeZoneDatabase[timeZoneID];
}

int16_t getTimeZoneID(std::string_view timeZone, bool failOnError) {
static folly::F14FastMap<std::string, int64_t> nameToIdMap =
makeReverseMap(getTimeZoneDB());
const auto& timeZoneIndex = getTimeZoneIndex();

std::string timeZoneLowered;
boost::algorithm::to_lower_copy(
std::back_inserter(timeZoneLowered), timeZone);

auto it = nameToIdMap.find(timeZoneLowered);
if (it != nameToIdMap.end()) {
auto it = timeZoneIndex.find(timeZoneLowered);
if (it != timeZoneIndex.end()) {
return it->second;
}

// If an exact match wasn't found, try to normalize the timezone name.
it = nameToIdMap.find(normalizeTimeZone(timeZoneLowered));
if (it != nameToIdMap.end()) {
it = timeZoneIndex.find(normalizeTimeZone(timeZoneLowered));
if (it != timeZoneIndex.end()) {
return it->second;
}
if (failOnError) {
Expand Down
8 changes: 3 additions & 5 deletions velox/type/tz/gen_timezone_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,14 @@

namespace facebook::velox::util {

const std::unordered_map<int64_t, std::string>& getTimeZoneDB() {
static auto* tzDB = new std::unordered_map<int64_t, std::string>([] {
const std::vector<std::pair<int16_t, std::string>>& getTimeZoneEntries() {
static auto* tzDB = new std::vector<std::pair<int16_t, std::string>>([] {
// Work around clang compiler bug causing multi-hour compilation
// with -fsanitize=fuzzer
// https://github.com/llvm/llvm-project/issues/75666
std::vector<std::pair<int64_t, std::string>> entries = {
return std::vector<std::pair<int16_t, std::string>>{
$entries
};
return std::unordered_map<int64_t, std::string>(
entries.begin(), entries.end());
}());
return *tzDB;
}
Expand Down
1 change: 1 addition & 0 deletions velox/type/tz/tests/TimeZoneMapTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ TEST(TimeZoneMapTest, getTimeZoneID) {
EXPECT_EQ(0, getTimeZoneID("UTC"));
EXPECT_EQ(0, getTimeZoneID("GMT"));
EXPECT_EQ(0, getTimeZoneID("Z"));
EXPECT_EQ(0, getTimeZoneID("z"));
EXPECT_EQ(0, getTimeZoneID("greenwich"));
EXPECT_EQ(0, getTimeZoneID("ETC/GMT"));
EXPECT_EQ(0, getTimeZoneID("ETC/GMT0"));
Expand Down
Loading