fix

kevinwilfong · Oct 22, 2024 · 1f18e8f · 1f18e8f
1 parent c21d511
commit 1f18e8f
Show file tree

Hide file tree

Showing 5 changed files with 244 additions and 33 deletions.
diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch
@@ -0,0 +1,30 @@
+diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
+--- a/velox/external/date/tz.cpp
++++ b/velox/external/date/tz.cpp
+@@ -3538,6 +3538,14 @@
+     return get_tzdb_list().front();
+ }
+
++std::vector<std::string> get_time_zone_names() {
++  std::vector<std::string> result;
++  for (const auto& z : get_tzdb().zones) {
++    result.push_back(z.name());
++  }
++  return result;
++}
++
+ const time_zone*
+ #if HAS_STRING_VIEW
+ tzdb::locate_zone(std::string_view tz_name) const
+diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
+--- a/velox/external/date/tz.h
++++ b/velox/external/date/tz.h
+@@ -1258,6 +1258,8 @@
+
+ DATE_API const tzdb& get_tzdb();
+
++std::vector<std::string> get_time_zone_names();
++
+ class tzdb_list
+ {
+     std::atomic<tzdb*> head_{nullptr};
diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
@@ -3538,6 +3538,14 @@ get_tzdb()
     return get_tzdb_list().front();
 }
 
+std::vector<std::string> get_time_zone_names() {
+  std::vector<std::string> result;
+  for (const auto& z : get_tzdb().zones) {
+    result.push_back(z.name());
+  }
+  return result;
+}
+
 const time_zone*
 #if HAS_STRING_VIEW
 tzdb::locate_zone(std::string_view tz_name) const

diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
@@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db);
 
 DATE_API const tzdb& get_tzdb();
 
+std::vector<std::string> get_time_zone_names();
+
 class tzdb_list
 {
     std::atomic<tzdb*> head_{nullptr};

diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp
@@ -347,6 +347,126 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) {
   return -1;
 }
 
+// Contains a list of all time zone names in a convenient format for searching.
+//
+// Time zone names without the '/' character (without a prefix) are stored in
+// timeZoneNamesWithoutPrefix ordered by size desc.
+//
+// Time zone names with the '/' character (with a prefix) are stored in a map
+// timeZoneNamePrefixMap from prefix (the string before the first '/') to a
+// vector of strings which contains the suffixes (the strings after the first
+// '/') ordered by size desc.
+struct TimeZoneNameMappings {
+  std::vector<std::string> timeZoneNamesWithoutPrefix;
+  std::unordered_map<std::string, std::vector<std::string>>
+      timeZoneNamePrefixMap;
+};
+
+TimeZoneNameMappings getTimeZoneNameMappings() {
+  // Here we use get_time_zone_names instead of calling get_tzdb and
+  // constructing the list ourselves because there is some unknown issue with
+  // the tz library where the time_zone objects after the first one in the tzdb
+  // will be invalid (contain nullptrs) after the get_tzdb function returns.
+  const std::vector<std::string> timeZoneNames = date::get_time_zone_names();
+
+  TimeZoneNameMappings result;
+  for (size_t i = 0; i < timeZoneNames.size(); i++) {
+    const auto& timeZoneName = timeZoneNames[i];
+    auto separatorPoint = timeZoneName.find('/');
+
+    if (separatorPoint == std::string::npos) {
+      result.timeZoneNamesWithoutPrefix.push_back(timeZoneName);
+    } else {
+      std::string prefix = timeZoneName.substr(0, separatorPoint);
+      std::string suffix = timeZoneName.substr(separatorPoint + 1);
+
+      result.timeZoneNamePrefixMap[prefix].push_back(suffix);
+    }
+  }
+
+  std::sort(
+      result.timeZoneNamesWithoutPrefix.begin(),
+      result.timeZoneNamesWithoutPrefix.end(),
+      [](const std::string& a, const std::string& b) {
+        return b.size() < a.size();
+      });
+
+  for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) {
+    std::sort(
+        suffixes.begin(),
+        suffixes.end(),
+        [](const std::string& a, const std::string& b) {
+          return b.size() < a.size();
+        });
+  }
+
+  return result;
+}
+
+int64_t parseTimezoneName(const char* cur, const char* end, Date& date) {
+  static const TimeZoneNameMappings timeZoneNameMappings =
+      getTimeZoneNameMappings();
+
+  if (cur < end) {
+    // Find the first instance of '/' in the remainder of the string
+    const char* separatorPoint = cur;
+    while (separatorPoint < end && *separatorPoint != '/') {
+      ++separatorPoint;
+    }
+
+    // Try to find a time zone with a prefix that includes the speratorPoint.
+    if (separatorPoint != end) {
+      std::string prefix(cur, separatorPoint);
+
+      auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix);
+      if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) {
+        // This is greedy, find the longest suffix for the given prefix that
+        // fits the string. We know the value in the map is already sorted by
+        // length in decreasing order.
+        for (const auto& suffixName : it->second) {
+          if (suffixName.size() <= end - separatorPoint - 1 &&
+              suffixName ==
+                  std::string_view(separatorPoint + 1, suffixName.size())) {
+            auto timeZoneNameSize = prefix.size() + 1 + suffixName.size();
+            date.timezone =
+                tz::locateZone(std::string_view(cur, timeZoneNameSize), false);
+
+            if (!date.timezone) {
+              return -1;
+            }
+
+            return timeZoneNameSize;
+          }
+        }
+      }
+    }
+
+    // If we found a '/' but didn't find a match in the set of time zones with
+    // prefixes, try search before the '/' for a time zone without a prefix. If
+    // we didn't find a '/' then end already equals separatorPoint.
+    end = separatorPoint;
+
+    for (const auto& timeZoneName :
+         timeZoneNameMappings.timeZoneNamesWithoutPrefix) {
+      // Again, this is greedy, find the largest time zone name without a prefix
+      // that fits the string. We know timeZoneNamesWithoutPrefix is already
+      // sorted by length in decreasing order.
+      if (timeZoneName.size() <= end - cur &&
+          timeZoneName == std::string_view(cur, timeZoneName.size())) {
+        date.timezone = tz::locateZone(timeZoneName, false);
+
+        if (!date.timezone) {
+          return -1;
+        }
+
+        return timeZoneName.size();
+      }
+    }
+  }
+
+  return -1;
+}
+
 int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) {
   // For timezone offset ids, there are three formats allowed by Joda:
   //
@@ -639,8 +759,8 @@ int getMaxDigitConsume(
         return curPattern.minRepresentDigits;
       } else {
         if (type == DateTimeFormatterType::MYSQL) {
-          // MySQL format will try to read in at most 4 digits when supplied a
-          // year, never more.
+          // MySQL format will try to read in at most 4 digits when
+          // supplied a year, never more.
           return 4;
         }
         return curPattern.minRepresentDigits > 9 ? curPattern.minRepresentDigits
@@ -681,7 +801,13 @@ int32_t parseFromPattern(
     bool specifierNext,
     DateTimeFormatterType type) {
   if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) {
-    auto size = parseTimezoneOffset(cur, end, date);
+    int64_t size;
+    if (curPattern.minRepresentDigits < 3) {
+      size = parseTimezoneOffset(cur, end, date);
+    } else {
+      size = parseTimezoneName(cur, end, date);
+    }
+
     if (size == -1) {
       return -1;
     }
@@ -762,12 +888,13 @@ int32_t parseFromPattern(
          curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA ||
          curPattern.specifier == DateTimeFormatSpecifier::WEEK_YEAR) &&
         curPattern.minRepresentDigits == 2) {
-      // If abbreviated two year digit is provided in format string, try to read
-      // in two digits of year and convert to appropriate full length year The
-      // two-digit mapping is as follows: [00, 69] -> [2000, 2069]
+      // If abbreviated two year digit is provided in format string, try
+      // to read in two digits of year and convert to appropriate full
+      // length year The two-digit mapping is as follows: [00, 69] ->
+      // [2000, 2069]
       //                                  [70, 99] -> [1970, 1999]
-      // If more than two digits are provided, then simply read in full year
-      // normally without conversion
+      // If more than two digits are provided, then simply read in full
+      // year normally without conversion
       int count = 0;
       while (cur < end && cur < startPos + maxDigitConsume &&
              characterIsDigit(*cur)) {
@@ -782,8 +909,8 @@ int32_t parseFromPattern(
           number += 2000;
         }
       } else if (type == DateTimeFormatterType::MYSQL) {
-        // In MySQL format, year read in must have exactly two digits, otherwise
-        // return -1 to indicate parsing error.
+        // In MySQL format, year read in must have exactly two digits,
+        // otherwise return -1 to indicate parsing error.
         if (count > 2) {
           // Larger than expected, print suffix.
           cur = cur - count + 2;
@@ -813,7 +940,8 @@ int32_t parseFromPattern(
 
     switch (curPattern.specifier) {
       case DateTimeFormatSpecifier::CENTURY_OF_ERA:
-        // Enforce Joda's year range if year was specified as "century of year".
+        // Enforce Joda's year range if year was specified as "century of
+        // year".
         if (number < 0 || number > 2922789) {
           return -1;
         }
@@ -827,7 +955,8 @@ int32_t parseFromPattern(
         date.centuryFormat = false;
         date.isYearOfEra =
             (curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA);
-        // Enforce Joda's year range if year was specified as "year of era".
+        // Enforce Joda's year range if year was specified as "year of
+        // era".
         if (date.isYearOfEra && (number > 292278993 || number < 1)) {
           return -1;
         }
@@ -846,9 +975,9 @@ int32_t parseFromPattern(
         date.month = number;
         date.weekDateFormat = false;
         date.dayOfYearFormat = false;
-        // Joda has this weird behavior where it returns 1970 as the year by
-        // default (if no year is specified), but if either day or month are
-        // specified, it fallsback to 2000.
+        // Joda has this weird behavior where it returns 1970 as the year
+        // by default (if no year is specified), but if either day or
+        // month are specified, it fallsback to 2000.
         if (!date.hasYear) {
           date.hasYear = true;
           date.year = 2000;
@@ -860,9 +989,9 @@ int32_t parseFromPattern(
         date.day = number;
         date.weekDateFormat = false;
         date.dayOfYearFormat = false;
-        // Joda has this weird behavior where it returns 1970 as the year by
-        // default (if no year is specified), but if either day or month are
-        // specified, it fallsback to 2000.
+        // Joda has this weird behavior where it returns 1970 as the year
+        // by default (if no year is specified), but if either day or
+        // month are specified, it fallsback to 2000.
         if (!date.hasYear) {
           date.hasYear = true;
           date.year = 2000;
@@ -874,9 +1003,9 @@ int32_t parseFromPattern(
         date.dayOfYear = number;
         date.dayOfYearFormat = true;
         date.weekDateFormat = false;
-        // Joda has this weird behavior where it returns 1970 as the year by
-        // default (if no year is specified), but if either day or month are
-        // specified, it fallsback to 2000.
+        // Joda has this weird behavior where it returns 1970 as the year
+        // by default (if no year is specified), but if either day or
+        // month are specified, it fallsback to 2000.
         if (!date.hasYear) {
           date.hasYear = true;
           date.year = 2000;
@@ -1283,14 +1412,14 @@ int32_t DateTimeFormatter::format(
         } break;
 
         case DateTimeFormatSpecifier::TIMEZONE: {
-          // TODO: implement short name time zone, need a map from full name to
-          // short name
+          // TODO: implement short name time zone, need a map from full name
+          // to short name
           VELOX_UNSUPPORTED("time zone name is not yet supported");
         } break;
 
         case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: {
-          // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset
-          // with a colon, 'ZZZ' or more outputs the zone id.
+          // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the
+          // offset with a colon, 'ZZZ' or more outputs the zone id.
           if (offset == 0 && zeroOffsetText.has_value()) {
             std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size());
             result += zeroOffsetText->size();
@@ -1447,8 +1576,8 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildMysqlDateTimeFormatter(
         Status::UserError("Both printing and parsing not supported"));
   }
 
-  // For %r we should reserve 1 extra space because it has 3 literals ':' ':'
-  // and ' '
+  // For %r we should reserve 1 extra space because it has 3 literals ':'
+  // ':' and ' '
   DateTimeFormatterBuilder builder(
       format.size() + countOccurence(format, "%r"));
 
@@ -1724,9 +1853,9 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
   while (cur < end) {
     const char* startTokenPtr = cur;
 
-    // For literal case, literal should be quoted using single quotes ('). If
-    // there is no quotes, it is interpreted as pattern letters. If there is
-    // only single quote, a user error will be thrown.
+    // For literal case, literal should be quoted using single quotes (').
+    // If there is no quotes, it is interpreted as pattern letters. If there
+    // is only single quote, a user error will be thrown.
     if (*startTokenPtr == '\'') {
       // Append single literal quote for 2 consecutive single quote.
       if (cur + 1 < end && *(cur + 1) == '\'') {
@@ -1752,8 +1881,8 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
         cur += count + 2;
       }
     } else {
-      // Append format specifier according to pattern letters. If pattern letter
-      // is not supported, a user error will be thrown.
+      // Append format specifier according to pattern letters. If pattern
+      // letter is not supported, a user error will be thrown.
       int count = 1;
       ++cur;
       while (cur < end && *startTokenPtr == *cur) {