Change cast(double|real as varchar) to return scientific notation (#7602)

Summary: Change cast(double as varchar) and cast(real as varchar) to return scientific notation when the magnitude of the input value is greater than or equal to 10^7, or less than 10^-3. Today they return standard notation in all cases. Change to use fmt::format() instead of folly::to() to output standard notation, for less noisy digits in outputs, when the magnitude of the input value is greater than or equal to 10^-3 and less than 10^7. Use QueryConfig legacy_cast = false to gate this new behavior. To keep the existing behavior, set legacy_cast to true. The returned scientific notation uses a max of 17 significant decimal digits for double and a max of 8 significant decimal digits for real, according to the double and single precision supported by IEEE 754 floating point, and as observed empirically in Presto. Also add 4 benchmarks for casting double or real to scientific notation or standard notation. The casting time of the new implementation (legacy_cast = false) is on par with that of the existing implementation (legacy_cast = true). Pull Request resolved: #7602 Reviewed By: kagamiori Differential Revision: D51386874 Pulled By: gggrace14 fbshipit-source-id: 84286074864b8a817d0f3feb97d498b4fe7c6c66
facebookincubator · Dec 5, 2023 · 14dc44b · 14dc44b
1 parent a5d816e
commit 14dc44b
Show file tree

Hide file tree

Showing 4 changed files with 322 additions and 6 deletions.
diff --git a/velox/benchmarks/basic/CastBenchmark.cpp b/velox/benchmarks/basic/CastBenchmark.cpp
@@ -46,6 +46,14 @@ int main(int argc, char** argv) {
       },
       nullptr,
       DECIMAL(38, 16));
+  auto largeRealInput = vectorMaker.flatVector<float>(
+      vectorSize, [&](auto j) { return 12345678.0 * j; });
+  auto smallRealInput = vectorMaker.flatVector<float>(
+      vectorSize, [&](auto j) { return 1.2345678 * j; });
+  auto smallDoubleInput = vectorMaker.flatVector<double>(
+      vectorSize, [&](auto j) { return -0.00012345678 / j; });
+  auto largeDoubleInput = vectorMaker.flatVector<double>(
+      vectorSize, [&](auto j) { return -123456.7 / j; });
   auto timestampInput =
       vectorMaker.flatVector<Timestamp>(vectorSize, [&](auto j) {
         return Timestamp(1695859694 + j / 1000, j % 1000 * 1'000'000);
@@ -71,13 +79,21 @@ int main(int argc, char** argv) {
                "decimal",
                "short_decimal",
                "long_decimal",
+               "large_real",
+               "small_real",
+               "small_double",
+               "large_double",
                "timestamp"},
               {validInput,
                invalidInput,
                nanInput,
                decimalInput,
                shortDecimalInput,
                longDecimalInput,
+               largeRealInput,
+               smallRealInput,
+               smallDoubleInput,
+               largeDoubleInput,
                timestampInput}))
       .addExpression("try_cast_invalid_empty_input", "try_cast (empty as int) ")
       .addExpression(
@@ -91,6 +107,17 @@ int main(int argc, char** argv) {
           "cast_decimal_to_inline_string", "cast (decimal as varchar)")
       .addExpression("cast_short_decimal", "cast (short_decimal as varchar)")
       .addExpression("cast_long_decimal", "cast (long_decimal as varchar)")
+      .addExpression(
+          "cast_large_real_to_scientific_notation",
+          "cast(large_real as varchar)")
+      .addExpression(
+          "cast_small_real_to_standard_notation", "cast(small_real as varchar)")
+      .addExpression(
+          "cast_small_double_to_scientific_notation",
+          "cast(small_double as varchar)")
+      .addExpression(
+          "cast_large_double_to_standard_notation",
+          "cast(large_double as varchar)")
       .addExpression("cast_timestamp", "cast (timestamp as varchar)")
       .withIterations(100)
       .disableTesting();

diff --git a/velox/docs/functions/presto/conversion.rst b/velox/docs/functions/presto/conversion.rst
@@ -506,6 +506,55 @@ Valid examples
   SELECT cast(cast(1 as DECIMAL(6, 2)) as varchar); -- '1.00'
   SELECT cast(cast(0 as DECIMAL(6, 2)) as varchar); -- '0.00'
 
+From Floating-Point Types
+^^^^^^^^^^^^^^^^^^^^^^^^^
+By default, casting a real or double to string returns standard notation if the magnitude of the input value is greater
+than or equal to 10 :superscript:`-3` but less than 10 :superscript:`7`, and returns scientific notation otherwise.
+
+Positive zero returns '0.0' and negative zero returns '-0.0'. Positive infinity returns 'Infinity' and negative infinity
+returns '-Infinity'. Both positive and negative NaN return 'NaN'.
+
+If the legacy_cast configuration property is true, the result is in standard notation for all input values.
+
+Valid examples if legacy_cast = false,
+
+::
+
+  SELECT cast(double '123456789.01234567' as varchar); -- '1.2345678901234567E8'
+  SELECT cast(double '10000000.0' as varchar); -- '1.0E7'
+  SELECT cast(double '12345.0' as varchar); -- '12345.0'
+  SELECT cast(double '-0.001' as varchar); -- '-0.001'
+  SELECT cast(double '-0.00012' as varchar); -- '-1.2E-4'
+  SELECT cast(double '0.0' as varchar); -- '0.0'
+  SELECT cast(double '-0.0' as varchar); -- '-0.0'
+  SELECT cast(infinity() as varchar); -- 'Infinity'
+  SELECT cast(-infinity() as varchar); -- '-Infinity'
+  SELECT cast(nan() as varchar); -- 'NaN'
+  SELECT cast(-nan() as varchar); -- 'NaN'
+
+  SELECT cast(real '123456780.0' as varchar); -- '1.2345678E8'
+  SELECT cast(real '10000000.0' as varchar); -- '1.0E7'
+  SELECT cast(real '12345.0' as varchar); -- '12345.0'
+  SELECT cast(real '-0.001' as varchar); -- '-0.001'
+  SELECT cast(real '-0.00012' as varchar); -- '-1.2E-4'
+  SELECT cast(real '0.0' as varchar); -- '0.0'
+  SELECT cast(real '-0.0' as varchar); -- '-0.0'
+
+Valid examples if legacy_cast = true,
+
+::
+
+  SELECT cast(double '123456789.01234567' as varchar); -- '123456789.01234567'
+  SELECT cast(double '10000000.0' as varchar); -- '10000000.0'
+  SELECT cast(double '-0.001' as varchar); -- '-0.001'
+  SELECT cast(double '-0.00012' as varchar); -- '-0.00012'
+
+  SELECT cast(real '123456780.0' as varchar); -- '123456784.0'
+  SELECT cast(real '10000000.0' as varchar); -- '10000000.0'
+  SELECT cast(real '12345.0' as varchar); -- '12345.0'
+  SELECT cast(real '-0.00012' as varchar); -- '-0.00011999999696854502'
+
+
 From TIMESTAMP
 ^^^^^^^^^^^^^^
 

diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp
@@ -551,6 +551,180 @@ TEST_F(CastExprTest, basics) {
       {"1.888", "2.5", "3.6", "100.44", "-100.101", "1", "-2"});
 }
 
+// Verifies cast(double|real as varchar) with legacy_cast disabled (scientific
+// notation outside [10^-3, 10^7) in magnitude) and enabled (standard notation
+// for every finite value).
+TEST_F(CastExprTest, realAndDoubleToString) {
+  // The same inputs are cast under both settings. They cover large and small
+  // magnitudes of both signs, both zeros, infinities and NaNs.
+  const std::vector<std::optional<double>> doubleInput{
+      12345678901234567000.0,
+      123456789.01234567,
+      10'000'000.0,
+      12345.0,
+      0.001,
+      0.00012,
+      0.0,
+      -0.0,
+      -0.00012,
+      -0.001,
+      -12345.0,
+      -10'000'000.0,
+      -123456789.01234567,
+      -12345678901234567000.0,
+      std::numeric_limits<double>::infinity(),
+      -std::numeric_limits<double>::infinity(),
+      std::numeric_limits<double>::quiet_NaN(),
+      -std::numeric_limits<double>::quiet_NaN(),
+  };
+  const std::vector<std::optional<float>> realInput{
+      12345678000000000000.0,
+      123456780.0,
+      10'000'000.0,
+      12345.0,
+      0.001,
+      0.00012,
+      0.0,
+      -0.0,
+      -0.00012,
+      -0.001,
+      -12345.0,
+      -10'000'000.0,
+      -123456780.0,
+      -12345678000000000000.0,
+      std::numeric_limits<float>::infinity(),
+      -std::numeric_limits<float>::infinity(),
+      std::numeric_limits<float>::quiet_NaN(),
+      -std::numeric_limits<float>::quiet_NaN(),
+  };
+
+  // New behavior: scientific notation for large and tiny magnitudes.
+  setLegacyCast(false);
+  testCast<double, std::string>(
+      "string",
+      doubleInput,
+      {
+          "1.2345678901234567E19",
+          "1.2345678901234567E8",
+          "1.0E7",
+          "12345.0",
+          "0.001",
+          "1.2E-4",
+          "0.0",
+          "-0.0",
+          "-1.2E-4",
+          "-0.001",
+          "-12345.0",
+          "-1.0E7",
+          "-1.2345678901234567E8",
+          "-1.2345678901234567E19",
+          "Infinity",
+          "-Infinity",
+          "NaN",
+          "NaN",
+      });
+  testCast<float, std::string>(
+      "string",
+      realInput,
+      {
+          "1.2345678E19",
+          "1.2345678E8",
+          "1.0E7",
+          "12345.0",
+          "0.001",
+          "1.2E-4",
+          "0.0",
+          "-0.0",
+          "-1.2E-4",
+          "-0.001",
+          "-12345.0",
+          "-1.0E7",
+          "-1.2345678E8",
+          "-1.2345678E19",
+          "Infinity",
+          "-Infinity",
+          "NaN",
+          "NaN",
+      });
+
+  // Legacy behavior: standard notation for every finite value.
+  setLegacyCast(true);
+  testCast<double, std::string>(
+      "string",
+      doubleInput,
+      {
+          "12345678901234567000.0",
+          "123456789.01234567",
+          "10000000.0",
+          "12345.0",
+          "0.001",
+          "0.00012",
+          "0.0",
+          "-0.0",
+          "-0.00012",
+          "-0.001",
+          "-12345.0",
+          "-10000000.0",
+          "-123456789.01234567",
+          "-12345678901234567000.0",
+          "Infinity",
+          "-Infinity",
+          "NaN",
+          "NaN",
+      });
+  testCast<float, std::string>(
+      "string",
+      realInput,
+      {
+          "12345678295994466000.0",
+          "123456784.0",
+          "10000000.0",
+          "12345.0",
+          "0.0010000000474974513",
+          "0.00011999999696854502",
+          "0.0",
+          "-0.0",
+          "-0.00011999999696854502",
+          "-0.0010000000474974513",
+          "-12345.0",
+          "-10000000.0",
+          "-123456784.0",
+          "-12345678295994466000.0",
+          "Infinity",
+          "-Infinity",
+          "NaN",
+          "NaN",
+      });
+}
+
 TEST_F(CastExprTest, stringToTimestamp) {
   std::vector<std::optional<std::string>> input{
       "1970-01-01",

diff --git a/velox/type/Conversions.h b/velox/type/Conversions.h
@@ -394,14 +394,29 @@ struct Converter<TypeKind::VARCHAR, void, TRUNCATE, LEGACY_CAST> {
+  /// Converts 'val' to its string representation.
+  ///
+  /// For double and float inputs:
+  ///  - When LEGACY_CAST is set, the value is formatted with folly::to and then
+  ///    passed through normalizeStandardNotation().
+  ///  - Otherwise, infinities and NaNs are formatted with folly::to; zero and
+  ///    values in (-10^7, -10^-3] or [10^-3, 10^7) are formatted in standard
+  ///    notation with fmt::format("{}"); all remaining values are formatted in
+  ///    scientific notation and passed through normalizeScientificNotation().
+  /// Every other input type is formatted with folly::to.
   template <typename T>
   static std::string cast(const T& val) {
     if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
-      auto stringValue = folly::to<std::string>(val);
-      if (!FLAGS_experimental_enable_legacy_cast &&
-          stringValue.find(".") == std::string::npos &&
-          isdigit(stringValue[stringValue.length() - 1])) {
-        stringValue += ".0";
+      // Legacy path: always standard notation.
+      if constexpr (LEGACY_CAST) {
+        auto str = folly::to<std::string>(val);
+        normalizeStandardNotation(str);
+        return str;
       }
-      return stringValue;
+
+      // Special values carry no digits to normalize.
+      if (FOLLY_UNLIKELY(std::isinf(val) || std::isnan(val))) {
+        return folly::to<std::string>(val);
+      }
+      // Zero and magnitudes in [10^-3, 10^7) use standard notation.
+      if ((val > -10'000'000 && val <= -0.001) ||
+          (val >= 0.001 && val < 10'000'000) || val == 0.0) {
+        auto str = fmt::format("{}", val);
+        normalizeStandardNotation(str);
+        return str;
+      }
+      // Precision of float is at most 8 significant decimal digits. Precision
+      // of double is at most 17 significant decimal digits. The format strings
+      // therefore request 7 and 16 digits after the decimal point respectively,
+      // in addition to the single leading digit.
+      auto str =
+          fmt::format(std::is_same_v<T, float> ? "{:.7E}" : "{:.16E}", val);
+      normalizeScientificNotation(str);
+      return str;
     }
+
     return folly::to<std::string>(val);
   }
 
@@ -418,6 +433,57 @@ struct Converter<TypeKind::VARCHAR, void, TRUNCATE, LEGACY_CAST> {
   static std::string cast(const bool& val) {
     return val ? "true" : "false";
   }
+
+  /// Normalize the given floating-point standard notation string in place, by
+  /// appending '.0' if it has only the integer part but no fractional part. For
+  /// example, for the given string '12345', replace it with '12345.0'.
+  ///
+  /// The string is left unchanged when it is empty, already contains a '.', or
+  /// does not end with a digit, and also when
+  /// FLAGS_experimental_enable_legacy_cast is set.
+  static void normalizeStandardNotation(std::string& str) {
+    // Guard against the empty string so that the last character is never read
+    // out of range.
+    if (FLAGS_experimental_enable_legacy_cast || str.empty()) {
+      return;
+    }
+    // The cast to unsigned char avoids undefined behavior in isdigit() for
+    // negative char values.
+    if (str.find('.') == std::string::npos &&
+        isdigit(static_cast<unsigned char>(str.back()))) {
+      str += ".0";
+    }
+  }
+
+  /// Normalize the given floating-point scientific notation string in place, by
+  /// removing the trailing 0s of the coefficient as well as the leading '+' and
+  /// 0s of the exponent. For example, for the given string '3.0000000E+005',
+  /// replace it with '3.0E5'. For '-1.2340000E-010', replace it with
+  /// '-1.234E-10'.
+  ///
+  /// At least one fractional digit is kept, so a coefficient such as
+  /// '3.0000000' becomes '3.0'. A '-' sign of the exponent is preserved. A
+  /// string without an 'E' is left unchanged.
+  static void normalizeScientificNotation(std::string& str) {
+    const size_t idxE = str.find('E');
+    VELOX_DCHECK_NE(
+        idxE,
+        std::string::npos,
+        "Expect a character 'E' in scientific notation.");
+    if (idxE == std::string::npos) {
+      // Nothing to normalize. Returning here avoids indexing out of range in
+      // builds where the debug check above is not active.
+      return;
+    }
+
+    // 'endCoef' is one past the last coefficient character to keep. Walk
+    // backwards over the trailing zeros of the coefficient.
+    size_t endCoef = idxE;
+    while (endCoef > 0 && str[endCoef - 1] == '0') {
+      --endCoef;
+    }
+    VELOX_DCHECK_GT(endCoef, 1, "Coefficient should not be all zeros.");
+    // If all fractional digits were zeros, keep a single '0' after the '.'.
+    if (endCoef > 0 && endCoef < idxE && str[endCoef - 1] == '.') {
+      ++endCoef;
+    }
+
+    // 'keepExp' is the first exponent position that may be dropped: right
+    // after 'E', or right after the '-' sign when the exponent is negative.
+    size_t keepExp = idxE + 1;
+    if (keepExp < str.length() && str[keepExp] == '-') {
+      ++keepExp;
+    }
+    // Skip the leading '+' and zeros of the exponent.
+    size_t startExp = keepExp;
+    while (startExp < str.length() &&
+           (str[startExp] == '0' || str[startExp] == '+')) {
+      ++startExp;
+    }
+    VELOX_DCHECK_LT(
+        startExp, str.length(), "Exponent should not be all zeros.");
+
+    // Erase the exponent padding first so that the coefficient indices stay
+    // valid, then erase the trailing zeros of the coefficient.
+    str.erase(keepExp, startExp - keepExp);
+    str.erase(endCoef, idxE - endCoef);
+  }
 };
 
 // Allow conversions from string to TIMESTAMP type.