Fix TRY_CAST(VARCHAR as TIMESTAMP) incorrectly suppress errors of inp…

…ut outside supported ranges (facebookincubator#10928) Summary: Pull Request resolved: facebookincubator#10928 Similar to facebookincubator#10097, force CAST(VARCHAR as TIMESTAMP) to throw VeloxRuntimeErrors on inputs outside the supported ranges, so that TRY_CAST doesn't suppress them. Reviewed By: amitkdutta, kgpai Differential Revision: D62196915 fbshipit-source-id: ef6237a4b69fee38a9425710b0102aeab1dfbe87
Yuhta · Sep 10, 2024 · da48a08 · da48a08
1 parent 38f9a1f
commit da48a08
Show file tree

Hide file tree

Showing 7 changed files with 77 additions and 8 deletions.
diff --git a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp
@@ -76,6 +76,13 @@ int main(int argc, char** argv) {
       "regexp_like",
       "regexp_replace",
       "regexp_split",
+      // date_format and format_datetime throw VeloxRuntimeError when input
+      // timestamp is out of the supported range.
+      "date_format",
+      "format_datetime",
+      // from_unixtime can generate timestamps out of the supported range that
+      // make other functions throw VeloxRuntimeErrors.
+      "from_unixtime",
   };
   size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed;
 

diff --git a/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp
@@ -62,7 +62,14 @@ int main(int argc, char** argv) {
       "chr",
       "replace",
       "might_contain",
-      "unix_timestamp"};
+      "unix_timestamp",
+      // from_unixtime throws VeloxRuntimeError when the timestamp is out of the
+      // supported range.
+      "from_unixtime",
+      // timestamp_millis(bigint) can generate timestamps out of the supported
+      // range that make other functions throw VeloxRuntimeErrors.
+      "timestamp_millis(bigint) -> timestamp",
+  };
 
   // Required by spark_partition_id function.
   std::unordered_map<std::string, std::string> queryConfigs = {

diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp
@@ -581,10 +581,27 @@ TEST_F(CastExprTest, stringToTimestamp) {
   };
   testCast<std::string, Timestamp>("timestamp", input, expected);
 
+  // Test invalid inputs.
   VELOX_ASSERT_THROW(
       (evaluateOnce<Timestamp, std::string>(
           "cast(c0 as timestamp)", "1970-01-01T00:00")),
       "Cannot cast VARCHAR '1970-01-01T00:00' to TIMESTAMP. Unable to parse timestamp value");
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<Timestamp, std::string>(
+          "cast(c0 as timestamp)", "201915-04-23 11:46:00.000")),
+      "Timepoint is outside of supported year range");
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<Timestamp, std::string>(
+          "try_cast(c0 as timestamp)", "201915-04-23 11:46:00.000")),
+      "Timepoint is outside of supported year range");
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<Timestamp, std::string>(
+          "cast(c0 as timestamp)", "2045-12-31 18:00:00")),
+      "Unable to convert timezone 'America/Los_Angeles' past 2037-11-01 09:00:00");
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<Timestamp, std::string>(
+          "try_cast(c0 as timestamp)", "2045-12-31 18:00:00")),
+      "Unable to convert timezone 'America/Los_Angeles' past 2037-11-01 09:00:00");
 
   setLegacyCast(true);
   input = {

diff --git a/velox/functions/lib/tests/DateTimeFormatterTest.cpp b/velox/functions/lib/tests/DateTimeFormatterTest.cpp
@@ -1447,10 +1447,10 @@ TEST_F(MysqlDateTimeTest, formatYear) {
       "-0001");
   EXPECT_THROW(
       formatMysqlDateTime("%Y", fromTimestampString("-99999-01-01"), timezone),
-      VeloxUserError);
+      VeloxRuntimeError);
   EXPECT_THROW(
       formatMysqlDateTime("%Y", fromTimestampString("99999-01-01"), timezone),
-      VeloxUserError);
+      VeloxRuntimeError);
 }
 
 TEST_F(MysqlDateTimeTest, formatMonthDay) {

diff --git a/velox/type/Timestamp.cpp b/velox/type/Timestamp.cpp
@@ -56,6 +56,11 @@ void Timestamp::toGMT(const tz::TimeZone& zone) {
   } catch (const date::nonexistent_local_time& error) {
     // If the time does not exist, fail the conversion.
     VELOX_USER_FAIL(error.what());
+  } catch (const std::invalid_argument& e) {
+    // Invalid argument means we hit a conversion not supported by
+    // external/date. Need to throw a RuntimeError so that try() statements do
+    // not suppress it.
+    VELOX_FAIL(e.what());
   }
   seconds_ = sysSeconds.count();
 }

diff --git a/velox/type/tz/TimeZoneMap.cpp b/velox/type/tz/TimeZoneMap.cpp
@@ -209,7 +209,7 @@ void validateRangeImpl(time_point<TDuration> timePoint) {
   auto year = year_month_day(floor<days>(timePoint)).year();
 
   if (year < kMinYear || year > kMaxYear) {
-    VELOX_USER_FAIL(
+    VELOX_FAIL(
         "Timepoint is outside of supported year range: [{}, {}], got {}",
         (int)kMinYear,
         (int)kMaxYear,

diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp
@@ -118,20 +118,44 @@ int128_t rand(FuzzerGenerator& rng) {
   return HugeInt::build(rand<int64_t>(rng), rand<uint64_t>(rng));
 }
 
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>, int> = 0>
+T rand(FuzzerGenerator& rng, T min, T max) {
+  return boost::random::uniform_int_distribution<T>(min, max)(rng);
+}
+
 Timestamp randTimestamp(FuzzerGenerator& rng, VectorFuzzer::Options opts) {
+  // Generate timestamps only in the valid range to avoid datetime functions,
+  // such as try_cast(varchar as timestamp), throwing VeloxRuntimeError in
+  // fuzzers.
+  constexpr int64_t min = -2'140'671'600;
+  constexpr int64_t max = 2'140'671'600;
+  constexpr int64_t microInSecond = 1'000'000;
+  constexpr int64_t millisInSecond = 1'000;
+
   switch (opts.timestampPrecision) {
     case VectorFuzzer::Options::TimestampPrecision::kNanoSeconds:
-      return Timestamp(rand<int32_t>(rng), (rand<int64_t>(rng) % MAX_NANOS));
+      return Timestamp(
+          rand<int64_t>(rng, min, max), (rand<int64_t>(rng) % MAX_NANOS));
     case VectorFuzzer::Options::TimestampPrecision::kMicroSeconds:
-      return Timestamp::fromMicros(rand<int64_t>(rng));
+      return Timestamp::fromMicros(
+          rand<int64_t>(rng, min, max) * microInSecond +
+          rand<int64_t>(rng, -microInSecond, microInSecond));
     case VectorFuzzer::Options::TimestampPrecision::kMilliSeconds:
-      return Timestamp::fromMillis(rand<int64_t>(rng));
+      return Timestamp::fromMillis(
+          rand<int64_t>(rng, min, max) * millisInSecond +
+          rand<int64_t>(rng, -millisInSecond, millisInSecond));
     case VectorFuzzer::Options::TimestampPrecision::kSeconds:
-      return Timestamp(rand<int32_t>(rng), 0);
+      return Timestamp(rand<int64_t>(rng, min, max), 0);
   }
   return {}; // no-op.
 }
 
+int32_t randDate(FuzzerGenerator& rng) {
+  constexpr int64_t min = -24'450;
+  constexpr int64_t max = 24'450;
+  return rand<int32_t>(rng, min, max);
+}
+
 size_t getElementsVectorLength(
     const VectorFuzzer::Options& opts,
     vector_size_t size) {
@@ -251,6 +275,9 @@ VectorPtr fuzzConstantPrimitiveImpl(
   if constexpr (std::is_same_v<TCpp, Timestamp>) {
     return std::make_shared<ConstantVector<TCpp>>(
         pool, size, false, type, randTimestamp(rng, opts));
+  } else if (type->isDate()) {
+    return std::make_shared<ConstantVector<int32_t>>(
+        pool, size, false, type, randDate(rng));
   } else if (type->isShortDecimal()) {
     return std::make_shared<ConstantVector<int64_t>>(
         pool, size, false, type, randShortDecimal(type, rng));
@@ -294,6 +321,12 @@ void fuzzFlatPrimitiveImpl(
       } else {
         VELOX_NYI();
       }
+    } else if constexpr (std::is_same_v<TCpp, int32_t>) {
+      if (vector->type()->isDate()) {
+        flatVector->set(i, randDate(rng));
+      } else {
+        flatVector->set(i, rand<TCpp>(rng));
+      }
     } else {
       flatVector->set(i, rand<TCpp>(rng));
     }