From b1834bde0f966969e535006c32939ac4ce787470 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Tue, 29 Oct 2024 12:51:02 -0700 Subject: [PATCH] Fix format_datetime to used linked TimeZone names (#11337) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11337 JODA has this concept of linked Time Zones where given a particular time zone ID, it will write out a different time zone ID when formatting a timestamp as a string (I suspect this is for historical reasons). More information is available about this here https://github.com/JodaOrg/global-tz This diff adds a script to parse the file JODA uses to get these links and turn it into a static map. I've done this for the file in the version of JODA Presto Java currently uses (there have been significant updates in more recent versions). Note that the file guarantees there are no transitive links so we don't need to handle that case. I then updated the format_datetime UDF to use this mapping for the ZZZ (or more) pattern. It will use the linked time zone ID in place of the time zone ID if one is available. This is also fixes casting TimestampWithTimeZone to Varchar which uses the same code path. Reviewed By: xiaoxmeng Differential Revision: D64870014 fbshipit-source-id: ff616c60f06118c157c5c91c05e5ccfc8fcc8698 --- velox/functions/lib/DateTimeFormatter.cpp | 19 +- .../prestosql/tests/DateTimeFunctionsTest.cpp | 11 ++ .../tests/TimestampWithTimeZoneCastTest.cpp | 3 +- velox/type/tz/CMakeLists.txt | 8 +- velox/type/tz/TimeZoneLinks.cpp | 186 ++++++++++++++++++ velox/type/tz/gen_timezone_links.py | 127 ++++++++++++ 6 files changed, 350 insertions(+), 4 deletions(-) create mode 100644 velox/type/tz/TimeZoneLinks.cpp create mode 100755 velox/type/tz/gen_timezone_links.py diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 32e1ab68f060..598146f28812 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -27,6 +27,11 @@ #include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" +namespace facebook::velox::tz { +// Defined in TimeZoneLinks.cpp +extern const std::unordered_map& getTimeZoneLinks(); +} // namespace facebook::velox::tz + namespace facebook::velox::functions { static thread_local std::string timezoneBuffer = "+00:00"; @@ -1234,7 +1239,9 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { if (timezone == nullptr) { VELOX_USER_FAIL("Timezone unknown"); } - size += timezone->name().length(); + + // The longest time zone ID is 32, America/Argentina/ComodRivadavia. + size += 32; } break; // Not supported. @@ -1494,6 +1501,16 @@ int32_t DateTimeFormatter::format( if (token.pattern.minRepresentDigits >= 3) { // Append the time zone ID. const auto& piece = timezone->name(); + + static const auto& timeZoneLinks = tz::getTimeZoneLinks(); + auto timeZoneLinksIter = timeZoneLinks.find(piece); + if (timeZoneLinksIter != timeZoneLinks.end()) { + const auto& timeZoneLink = timeZoneLinksIter->second; + std::memcpy(result, timeZoneLink.data(), timeZoneLink.length()); + result += timeZoneLink.length(); + break; + } + std::memcpy(result, piece.data(), piece.length()); result += piece.length(); break; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 9cce58653252..20265004aafa 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3340,6 +3340,17 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) { "Australian Central Western Standard Time", formatDatetime(parseTimestamp("1970-10-01"), "zzzz")); + // Test a time zone name that is linked to another (that gets replaced when + // converted to a string). + setQueryTimeZone("US/Pacific"); + EXPECT_EQ("PST", formatDatetime(parseTimestamp("1970-01-01"), "zzz")); + EXPECT_EQ( + "Pacific Standard Time", + formatDatetime(parseTimestamp("1970-01-01"), "zzzz")); + EXPECT_EQ( + "America/Los_Angeles", + formatDatetime(parseTimestamp("1970-01-01"), "ZZZ")); + setQueryTimeZone("Asia/Kolkata"); // Literal test cases. EXPECT_EQ("hello", formatDatetime(parseTimestamp("1970-01-01"), "'hello'")); diff --git a/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp b/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp index 6b9990ada924..ddd100c6b46a 100644 --- a/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp +++ b/velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp @@ -125,7 +125,8 @@ TEST_F(TimestampWithTimeZoneCastTest, toVarchar) { "1970-01-01 01:11:37.123 America/New_York", "1969-12-31 22:11:37.123 America/Los_Angeles", "1970-01-01 14:11:37.123 Asia/Shanghai", - "1970-01-01 11:41:37.123 Asia/Calcutta", + "1970-01-01 11:41:37.123 Asia/Kolkata", // Asia/Calcutta is linked to + // Asia/Kolkata. }); auto result = evaluate("cast(c0 as varchar)", makeRowVector({input})); diff --git a/velox/type/tz/CMakeLists.txt b/velox/type/tz/CMakeLists.txt index 614b594000a7..8f7c2381b823 100644 --- a/velox/type/tz/CMakeLists.txt +++ b/velox/type/tz/CMakeLists.txt @@ -14,8 +14,12 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -velox_add_library(velox_type_tz TimeZoneMap.h TimeZoneDatabase.cpp - TimeZoneMap.cpp) +velox_add_library( + velox_type_tz + TimeZoneMap.h + TimeZoneDatabase.cpp + TimeZoneLinks.cpp + TimeZoneMap.cpp) velox_link_libraries( velox_type_tz diff --git a/velox/type/tz/TimeZoneLinks.cpp b/velox/type/tz/TimeZoneLinks.cpp new file mode 100644 index 000000000000..de1e381ceb91 --- /dev/null +++ b/velox/type/tz/TimeZoneLinks.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is generated. Do not modify it manually. To re-generate it, run: +// +// ./velox/type/tz/gen_timezone_links.py -f /tmp/backward \ +// > velox/type/tz/TimeZoneLinks.cpp +// +// The backward file should be the same one used in JODA, +// The latest is available here : +// https://github.com/JodaOrg/global-tz/blob/global-tz/backward. +// Presto Java currently uses the one found here: +// https://github.com/JodaOrg/global-tz/blob/2024agtz/backward +// To find the current version in Presto Java, check this file at the version of +// JODA Presto Java is using: +// https://github.com/JodaOrg/joda-time/blob/v2.12.7/src/changes/changes.xml +// @generated + +#include +#include + +namespace facebook::velox::tz { + +const std::unordered_map& getTimeZoneLinks() { + static auto* tzLinks = new std::unordered_map([] { + // Work around clang compiler bug causing multi-hour compilation + // with -fsanitize=fuzzer + // https://github.com/llvm/llvm-project/issues/75666 + return std::unordered_map{ + {"Australia/ACT", "Australia/Sydney"}, + {"Australia/LHI", "Australia/Lord_Howe"}, + {"Australia/NSW", "Australia/Sydney"}, + {"Australia/North", "Australia/Darwin"}, + {"Australia/Queensland", "Australia/Brisbane"}, + {"Australia/South", "Australia/Adelaide"}, + {"Australia/Tasmania", "Australia/Hobart"}, + {"Australia/Victoria", "Australia/Melbourne"}, + {"Australia/West", "Australia/Perth"}, + {"Australia/Yancowinna", "Australia/Broken_Hill"}, + {"Brazil/Acre", "America/Rio_Branco"}, + {"Brazil/DeNoronha", "America/Noronha"}, + {"Brazil/East", "America/Sao_Paulo"}, + {"Brazil/West", "America/Manaus"}, + {"Canada/Atlantic", "America/Halifax"}, + {"Canada/Central", "America/Winnipeg"}, + {"Canada/Eastern", "America/Toronto"}, + {"Canada/Mountain", "America/Edmonton"}, + {"Canada/Newfoundland", "America/St_Johns"}, + {"Canada/Pacific", "America/Vancouver"}, + {"Canada/Saskatchewan", "America/Regina"}, + {"Canada/Yukon", "America/Whitehorse"}, + {"Chile/Continental", "America/Santiago"}, + {"Chile/EasterIsland", "Pacific/Easter"}, + {"Cuba", "America/Havana"}, + {"Egypt", "Africa/Cairo"}, + {"Eire", "Europe/Dublin"}, + {"Etc/GMT+0", "Etc/GMT"}, + {"Etc/GMT-0", "Etc/GMT"}, + {"Etc/GMT0", "Etc/GMT"}, + {"Etc/Greenwich", "Etc/GMT"}, + {"Etc/UCT", "Etc/UTC"}, + {"Etc/Universal", "Etc/UTC"}, + {"Etc/Zulu", "Etc/UTC"}, + {"GB", "Europe/London"}, + {"GB-Eire", "Europe/London"}, + {"GMT+0", "Etc/GMT"}, + {"GMT-0", "Etc/GMT"}, + {"GMT0", "Etc/GMT"}, + {"Greenwich", "Etc/GMT"}, + {"Hongkong", "Asia/Hong_Kong"}, + {"Iceland", "Atlantic/Reykjavik"}, + {"Iran", "Asia/Tehran"}, + {"Israel", "Asia/Jerusalem"}, + {"Jamaica", "America/Jamaica"}, + {"Japan", "Asia/Tokyo"}, + {"Kwajalein", "Pacific/Kwajalein"}, + {"Libya", "Africa/Tripoli"}, + {"Mexico/BajaNorte", "America/Tijuana"}, + {"Mexico/BajaSur", "America/Mazatlan"}, + {"Mexico/General", "America/Mexico_City"}, + {"NZ", "Pacific/Auckland"}, + {"NZ-CHAT", "Pacific/Chatham"}, + {"Navajo", "America/Denver"}, + {"PRC", "Asia/Shanghai"}, + {"Poland", "Europe/Warsaw"}, + {"Portugal", "Europe/Lisbon"}, + {"ROC", "Asia/Taipei"}, + {"ROK", "Asia/Seoul"}, + {"Singapore", "Asia/Singapore"}, + {"Turkey", "Europe/Istanbul"}, + {"UCT", "Etc/UTC"}, + {"US/Alaska", "America/Anchorage"}, + {"US/Aleutian", "America/Adak"}, + {"US/Arizona", "America/Phoenix"}, + {"US/Central", "America/Chicago"}, + {"US/East-Indiana", "America/Indiana/Indianapolis"}, + {"US/Eastern", "America/New_York"}, + {"US/Hawaii", "Pacific/Honolulu"}, + {"US/Indiana-Starke", "America/Indiana/Knox"}, + {"US/Michigan", "America/Detroit"}, + {"US/Mountain", "America/Denver"}, + {"US/Pacific", "America/Los_Angeles"}, + {"US/Samoa", "Pacific/Pago_Pago"}, + {"UTC", "Etc/UTC"}, + {"Universal", "Etc/UTC"}, + {"W-SU", "Europe/Moscow"}, + {"Zulu", "Etc/UTC"}, + {"America/Buenos_Aires", "America/Argentina/Buenos_Aires"}, + {"America/Catamarca", "America/Argentina/Catamarca"}, + {"America/Cordoba", "America/Argentina/Cordoba"}, + {"America/Indianapolis", "America/Indiana/Indianapolis"}, + {"America/Jujuy", "America/Argentina/Jujuy"}, + {"America/Knox_IN", "America/Indiana/Knox"}, + {"America/Louisville", "America/Kentucky/Louisville"}, + {"America/Mendoza", "America/Argentina/Mendoza"}, + {"America/Virgin", "America/St_Thomas"}, + {"Pacific/Samoa", "Pacific/Pago_Pago"}, + {"Africa/Timbuktu", "Africa/Bamako"}, + {"America/Argentina/ComodRivadavia", "America/Argentina/Catamarca"}, + {"America/Atka", "America/Adak"}, + {"America/Coral_Harbour", "America/Atikokan"}, + {"America/Ensenada", "America/Tijuana"}, + {"America/Fort_Wayne", "America/Indiana/Indianapolis"}, + {"America/Montreal", "America/Toronto"}, + {"America/Nipigon", "America/Toronto"}, + {"America/Pangnirtung", "America/Iqaluit"}, + {"America/Porto_Acre", "America/Rio_Branco"}, + {"America/Rainy_River", "America/Winnipeg"}, + {"America/Rosario", "America/Argentina/Cordoba"}, + {"America/Santa_Isabel", "America/Tijuana"}, + {"America/Shiprock", "America/Denver"}, + {"America/Thunder_Bay", "America/Toronto"}, + {"America/Yellowknife", "America/Edmonton"}, + {"Antarctica/South_Pole", "Antarctica/McMurdo"}, + {"Asia/Chongqing", "Asia/Shanghai"}, + {"Asia/Harbin", "Asia/Shanghai"}, + {"Asia/Kashgar", "Asia/Urumqi"}, + {"Asia/Tel_Aviv", "Asia/Jerusalem"}, + {"Atlantic/Jan_Mayen", "Europe/Oslo"}, + {"Australia/Canberra", "Australia/Sydney"}, + {"Australia/Currie", "Australia/Hobart"}, + {"Europe/Belfast", "Europe/London"}, + {"Europe/Tiraspol", "Europe/Chisinau"}, + {"Europe/Uzhgorod", "Europe/Kyiv"}, + {"Europe/Zaporozhye", "Europe/Kyiv"}, + {"Pacific/Enderbury", "Pacific/Kanton"}, + {"Pacific/Johnston", "Pacific/Honolulu"}, + {"Pacific/Yap", "Pacific/Chuuk"}, + {"Africa/Asmera", "Africa/Asmara"}, + {"America/Godthab", "America/Nuuk"}, + {"Asia/Ashkhabad", "Asia/Ashgabat"}, + {"Asia/Calcutta", "Asia/Kolkata"}, + {"Asia/Chungking", "Asia/Shanghai"}, + {"Asia/Dacca", "Asia/Dhaka"}, + {"Asia/Istanbul", "Europe/Istanbul"}, + {"Asia/Katmandu", "Asia/Kathmandu"}, + {"Asia/Macao", "Asia/Macau"}, + {"Asia/Rangoon", "Asia/Yangon"}, + {"Asia/Saigon", "Asia/Ho_Chi_Minh"}, + {"Asia/Thimbu", "Asia/Thimphu"}, + {"Asia/Ujung_Pandang", "Asia/Makassar"}, + {"Asia/Ulan_Bator", "Asia/Ulaanbaatar"}, + {"Atlantic/Faeroe", "Atlantic/Faroe"}, + {"Europe/Kiev", "Europe/Kyiv"}, + {"Europe/Nicosia", "Asia/Nicosia"}, + {"Pacific/Ponape", "Pacific/Pohnpei"}, + {"Pacific/Truk", "Pacific/Chuuk"}, + }; + }()); + return *tzLinks; +} + +} // namespace facebook::velox::tz diff --git a/velox/type/tz/gen_timezone_links.py b/velox/type/tz/gen_timezone_links.py new file mode 100755 index 000000000000..6b414b247c01 --- /dev/null +++ b/velox/type/tz/gen_timezone_links.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys +from string import Template + +cpp_template = Template( + """\ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is generated. Do not modify it manually. To re-generate it, run: +// +// ./velox/type/tz/gen_timezone_links.py -f /tmp/backward \\ +// > velox/type/tz/TimeZoneLinks.cpp +// +// The backward file should be the same one used in JODA, +// The latest is available here : +// https://github.com/JodaOrg/global-tz/blob/global-tz/backward. +// Presto Java currently uses the one found here: +// https://github.com/JodaOrg/global-tz/blob/2024agtz/backward +// To find the current version in Presto Java, check this file at the version of +// JODA Presto Java is using: +// https://github.com/JodaOrg/joda-time/blob/v2.12.7/src/changes/changes.xml +// @generated + +#include +#include + +namespace facebook::velox::tz { + +const std::unordered_map& getTimeZoneLinks() { + static auto* tzLinks = new std::unordered_map([] { + // Work around clang compiler bug causing multi-hour compilation + // with -fsanitize=fuzzer + // https://github.com/llvm/llvm-project/issues/75666 + return std::unordered_map{ +$entries + }; + }()); + return *tzLinks; +} + +} // namespace facebook::velox::tz\ +""" +) + +entry_template = Template(' {"$tz_key", "$tz_value"},') + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Reads an input file (specified by -f) containing mappings from " + "time zone names to the names that should replace them and prints the " + "corresponding `TimeZoneLinks.cpp` file.", + epilog="(c) Facebook 2004-present", + ) + parser.add_argument( + "-f", + "--file-input", + required=True, + help="Input timezone links file.", + ) + return parser.parse_args() + + +def main(): + args = parse_arguments() + entries = [] + + # Read an input file with links following the format: + # + # > Link Australia/Sydney Australia/NSW + # > Link Australia/Darwin Australia/North + # > Link Australia/Brisbane Australia/Queensland + # > ... + # + # Possible containing comments (everything in a line after # is ignored) + + with open(args.file_input) as file_in: + for line in file_in: + line = line.strip() + + columns = line.split() + if len(columns) == 0: + continue + + if columns[0] != "Link": + continue + + entries.append( + entry_template.substitute(tz_key=columns[2], tz_value=columns[1]) + ) + + print(cpp_template.substitute(entries="\n".join(entries))) + return 0 + + +if __name__ == "__main__": + sys.exit(main())