Skip to content

Commit

Permalink
Change cast(double|real as varchar) to return scientific notation (#7602
Browse files Browse the repository at this point in the history
)

Summary:
Change cast(double as varchar) and cast(real as varchar) to return scientific
notation, when magnitude of the input value is greater than or equal to 10^7,
or less than 10^-3. Today they return standard notation in all cases.

Change to use fmt::format() instead of folly::to() to output standard notation,
for less noisy digits in outputs, when magnitude of the input value is greater
than or equal to 10^-3 and less than 10^7.

Use QueryConfig legacy_cast false to gate this new
behavior. To keep existing behavior, set legacy_cast to true.

The returned scientific notation uses at most 17 significant decimal digits
for double and at most 8 significant decimal digits for real, matching the
precision of IEEE 754 double- and single-precision floating point, as well as
the behavior observed empirically in Presto.

Also add 4 benchmarks for casting double or real to scientific notation or standard
notation. The casting time of the new implementation (legacy_cast = false) is
on par with that of the existing implementation (legacy_cast = true).

Pull Request resolved: #7602

Reviewed By: kagamiori

Differential Revision: D51386874

Pulled By: gggrace14

fbshipit-source-id: 84286074864b8a817d0f3feb97d498b4fe7c6c66
  • Loading branch information
gggrace14 authored and facebook-github-bot committed Dec 5, 2023
1 parent a5d816e commit 14dc44b
Show file tree
Hide file tree
Showing 4 changed files with 322 additions and 6 deletions.
27 changes: 27 additions & 0 deletions velox/benchmarks/basic/CastBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ int main(int argc, char** argv) {
},
nullptr,
DECIMAL(38, 16));
auto largeRealInput = vectorMaker.flatVector<float>(
vectorSize, [&](auto j) { return 12345678.0 * j; });
auto smallRealInput = vectorMaker.flatVector<float>(
vectorSize, [&](auto j) { return 1.2345678 * j; });
auto smallDoubleInput = vectorMaker.flatVector<double>(
vectorSize, [&](auto j) { return -0.00012345678 / j; });
auto largeDoubleInput = vectorMaker.flatVector<double>(
vectorSize, [&](auto j) { return -123456.7 / j; });
auto timestampInput =
vectorMaker.flatVector<Timestamp>(vectorSize, [&](auto j) {
return Timestamp(1695859694 + j / 1000, j % 1000 * 1'000'000);
Expand All @@ -71,13 +79,21 @@ int main(int argc, char** argv) {
"decimal",
"short_decimal",
"long_decimal",
"large_real",
"small_real",
"small_double",
"large_double",
"timestamp"},
{validInput,
invalidInput,
nanInput,
decimalInput,
shortDecimalInput,
longDecimalInput,
largeRealInput,
smallRealInput,
smallDoubleInput,
largeDoubleInput,
timestampInput}))
.addExpression("try_cast_invalid_empty_input", "try_cast (empty as int) ")
.addExpression(
Expand All @@ -91,6 +107,17 @@ int main(int argc, char** argv) {
"cast_decimal_to_inline_string", "cast (decimal as varchar)")
.addExpression("cast_short_decimal", "cast (short_decimal as varchar)")
.addExpression("cast_long_decimal", "cast (long_decimal as varchar)")
.addExpression(
"cast_large_real_to_scientific_notation",
"cast(large_real as varchar)")
.addExpression(
"cast_small_real_to_standard_notation", "cast(small_real as varchar)")
.addExpression(
"cast_small_double_to_scientific_notation",
"cast(small_double as varchar)")
.addExpression(
"cast_large_double_to_standard_notation",
"cast(large_double as varchar)")
.addExpression("cast_timestamp", "cast (timestamp as varchar)")
.withIterations(100)
.disableTesting();
Expand Down
49 changes: 49 additions & 0 deletions velox/docs/functions/presto/conversion.rst
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,55 @@ Valid examples
SELECT cast(cast(1 as DECIMAL(6, 2)) as varchar); -- '1.00'
SELECT cast(cast(0 as DECIMAL(6, 2)) as varchar); -- '0.00'

From Floating-Point Types
^^^^^^^^^^^^^^^^^^^^^^^^^
By default, casting a real or double to string returns standard notation if the magnitude of the input value is greater than
or equal to 10 :superscript:`-3` but less than 10 :superscript:`7`, and returns scientific notation otherwise.

Positive zero returns '0.0' and negative zero returns '-0.0'. Positive infinity returns 'Infinity' and negative infinity
returns '-Infinity'. Both positive and negative NaN return 'NaN'.

If the legacy_cast configuration property is true, the result is in standard notation for all input values.

Valid examples if legacy_cast = false,

::

SELECT cast(double '123456789.01234567' as varchar); -- '1.2345678901234567E8'
SELECT cast(double '10000000.0' as varchar); -- '1.0E7'
SELECT cast(double '12345.0' as varchar); -- '12345.0'
SELECT cast(double '-0.001' as varchar); -- '-0.001'
SELECT cast(double '-0.00012' as varchar); -- '-1.2E-4'
SELECT cast(double '0.0' as varchar); -- '0.0'
SELECT cast(double '-0.0' as varchar); -- '-0.0'
SELECT cast(infinity() as varchar); -- 'Infinity'
SELECT cast(-infinity() as varchar); -- '-Infinity'
SELECT cast(nan() as varchar); -- 'NaN'
SELECT cast(-nan() as varchar); -- 'NaN'

SELECT cast(real '123456780.0' as varchar); -- '1.2345678E8'
SELECT cast(real '10000000.0' as varchar); -- '1.0E7'
SELECT cast(real '12345.0' as varchar); -- '12345.0'
SELECT cast(real '-0.001' as varchar); -- '-0.001'
SELECT cast(real '-0.00012' as varchar); -- '-1.2E-4'
SELECT cast(real '0.0' as varchar); -- '0.0'
SELECT cast(real '-0.0' as varchar); -- '-0.0'

Valid examples if legacy_cast = true,

::

SELECT cast(double '123456789.01234567' as varchar); -- '123456789.01234567'
SELECT cast(double '10000000.0' as varchar); -- '10000000.0'
SELECT cast(double '-0.001' as varchar); -- '-0.001'
SELECT cast(double '-0.00012' as varchar); -- '-0.00012'

SELECT cast(real '123456780.0' as varchar); -- '123456784.0'
SELECT cast(real '10000000.0' as varchar); -- '10000000.0'
SELECT cast(real '12345.0' as varchar); -- '12345.0'
SELECT cast(real '-0.00012' as varchar); -- '-0.00011999999696854502'


From TIMESTAMP
^^^^^^^^^^^^^^

Expand Down
174 changes: 174 additions & 0 deletions velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,180 @@ TEST_F(CastExprTest, basics) {
{"1.888", "2.5", "3.6", "100.44", "-100.101", "1", "-2"});
}

// Checks the VARCHAR rendering of DOUBLE and REAL values, first with
// legacy_cast disabled (scientific notation outside [10^-3, 10^7)) and then
// with legacy_cast enabled (standard notation everywhere). The same inputs are
// used for both settings; only the expected strings differ.
TEST_F(CastExprTest, realAndDoubleToString) {
  using Strings = std::vector<std::optional<std::string>>;

  const std::vector<std::optional<double>> doubleInputs = {
      12345678901234567000.0,
      123456789.01234567,
      10'000'000.0,
      12345.0,
      0.001,
      0.00012,
      0.0,
      -0.0,
      -0.00012,
      -0.001,
      -12345.0,
      -10'000'000.0,
      -123456789.01234567,
      -12345678901234567000.0,
      std::numeric_limits<double>::infinity(),
      -std::numeric_limits<double>::infinity(),
      std::numeric_limits<double>::quiet_NaN(),
      -std::numeric_limits<double>::quiet_NaN(),
  };

  const std::vector<std::optional<float>> realInputs = {
      12345678000000000000.0,
      123456780.0,
      10'000'000.0,
      12345.0,
      0.001,
      0.00012,
      0.0,
      -0.0,
      -0.00012,
      -0.001,
      -12345.0,
      -10'000'000.0,
      -123456780.0,
      -12345678000000000000.0,
      std::numeric_limits<float>::infinity(),
      -std::numeric_limits<float>::infinity(),
      std::numeric_limits<float>::quiet_NaN(),
      -std::numeric_limits<float>::quiet_NaN(),
  };

  // New behavior: scientific notation for very large and very small values.
  setLegacyCast(false);

  const Strings doubleExpected = {
      "1.2345678901234567E19",
      "1.2345678901234567E8",
      "1.0E7",
      "12345.0",
      "0.001",
      "1.2E-4",
      "0.0",
      "-0.0",
      "-1.2E-4",
      "-0.001",
      "-12345.0",
      "-1.0E7",
      "-1.2345678901234567E8",
      "-1.2345678901234567E19",
      "Infinity",
      "-Infinity",
      "NaN",
      "NaN",
  };
  testCast<double, std::string>("string", doubleInputs, doubleExpected);

  const Strings realExpected = {
      "1.2345678E19",
      "1.2345678E8",
      "1.0E7",
      "12345.0",
      "0.001",
      "1.2E-4",
      "0.0",
      "-0.0",
      "-1.2E-4",
      "-0.001",
      "-12345.0",
      "-1.0E7",
      "-1.2345678E8",
      "-1.2345678E19",
      "Infinity",
      "-Infinity",
      "NaN",
      "NaN",
  };
  testCast<float, std::string>("string", realInputs, realExpected);

  // Legacy behavior: standard notation for every finite value.
  setLegacyCast(true);

  const Strings legacyDoubleExpected = {
      "12345678901234567000.0",
      "123456789.01234567",
      "10000000.0",
      "12345.0",
      "0.001",
      "0.00012",
      "0.0",
      "-0.0",
      "-0.00012",
      "-0.001",
      "-12345.0",
      "-10000000.0",
      "-123456789.01234567",
      "-12345678901234567000.0",
      "Infinity",
      "-Infinity",
      "NaN",
      "NaN",
  };
  testCast<double, std::string>("string", doubleInputs, legacyDoubleExpected);

  const Strings legacyRealExpected = {
      "12345678295994466000.0",
      "123456784.0",
      "10000000.0",
      "12345.0",
      "0.0010000000474974513",
      "0.00011999999696854502",
      "0.0",
      "-0.0",
      "-0.00011999999696854502",
      "-0.0010000000474974513",
      "-12345.0",
      "-10000000.0",
      "-123456784.0",
      "-12345678295994466000.0",
      "Infinity",
      "-Infinity",
      "NaN",
      "NaN",
  };
  testCast<float, std::string>("string", realInputs, legacyRealExpected);
}

TEST_F(CastExprTest, stringToTimestamp) {
std::vector<std::optional<std::string>> input{
"1970-01-01",
Expand Down
78 changes: 72 additions & 6 deletions velox/type/Conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,14 +394,29 @@ struct Converter<TypeKind::VARCHAR, void, TRUNCATE, LEGACY_CAST> {
template <typename T>
static std::string cast(const T& val) {
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
auto stringValue = folly::to<std::string>(val);
if (!FLAGS_experimental_enable_legacy_cast &&
stringValue.find(".") == std::string::npos &&
isdigit(stringValue[stringValue.length() - 1])) {
stringValue += ".0";
if constexpr (LEGACY_CAST) {
auto str = folly::to<std::string>(val);
normalizeStandardNotation(str);
return str;
}
return stringValue;

if (FOLLY_UNLIKELY(std::isinf(val) || std::isnan(val))) {
return folly::to<std::string>(val);
}
if ((val > -10'000'000 && val <= -0.001) ||
(val >= 0.001 && val < 10'000'000) || val == 0.0) {
auto str = fmt::format("{}", val);
normalizeStandardNotation(str);
return str;
}
// Precision of float is at most 8 significant decimal digits. Precision
// of double is at most 17 significant decimal digits.
auto str =
fmt::format(std::is_same_v<T, float> ? "{:.7E}" : "{:.16E}", val);
normalizeScientificNotation(str);
return str;
}

return folly::to<std::string>(val);
}

Expand All @@ -418,6 +433,57 @@ struct Converter<TypeKind::VARCHAR, void, TRUNCATE, LEGACY_CAST> {
/// Renders a boolean as the lowercase literal "true" or "false".
static std::string cast(const bool& val) {
  if (val) {
    return "true";
  }
  return "false";
}

/// Normalize the given floating-point standard notation string in place, by
/// appending '.0' if it has only the integer part but no fractional part. For
/// example, for the given string '12345', replace it with '12345.0'.
///
/// The string is left untouched when FLAGS_experimental_enable_legacy_cast is
/// set, when it is empty, when it already contains a '.', or when its last
/// character is not a decimal digit (e.g. 'NaN' or 'Infinity').
static void normalizeStandardNotation(std::string& str) {
  if (FLAGS_experimental_enable_legacy_cast || str.empty()) {
    // An empty string has no last character to inspect.
    return;
  }
  if (str.find('.') != std::string::npos) {
    return;
  }
  // Convert to unsigned char first: calling isdigit() with a negative char
  // value is undefined behavior.
  if (isdigit(static_cast<unsigned char>(str.back()))) {
    str += ".0";
  }
}

/// Normalize the given floating-point scientific notation string in place, by
/// removing the trailing 0s of the coefficient (one digit is always kept after
/// the decimal point) as well as the leading '+' and 0s of the exponent. For
/// example, for the given string '3.0000000E+005', replace it with '3.0E5'.
/// For '-1.2340000E-010', replace it with '-1.234E-10'.
///
/// The input is expected to have the shape produced by "{:.7E}" / "{:.16E}":
/// a coefficient with at least one digit after the '.', an 'E', and a non-zero
/// exponent. The VELOX_DCHECK_* calls flag violations of that shape
/// (presumably only in debug builds -- confirm). When they do not fire,
/// malformed input is still handled without indexing out of bounds:
///  - no 'E', an empty/all-zero coefficient without '.', or a '.' directly
///    followed by 'E': the string is left unchanged;
///  - an exponent consisting only of zeros: a single '0' is kept.
static void normalizeScientificNotation(std::string& str) {
  const size_t idxE = str.find('E');
  VELOX_DCHECK_NE(
      idxE,
      std::string::npos,
      "Expect a character 'E' in scientific notation.");
  if (idxE == std::string::npos) {
    return;
  }

  // One past the last coefficient character that is not a trailing '0'.
  size_t coefEnd = idxE;
  while (coefEnd > 0 && str[coefEnd - 1] == '0') {
    --coefEnd;
  }
  VELOX_DCHECK_GT(coefEnd, 1u, "Coefficient should not be all zeros.");

  // 'pos' is the write cursor for the rewritten tail of the string.
  size_t pos = coefEnd;
  if (coefEnd > 0 && str[coefEnd - 1] == '.') {
    // Keep the first fractional '0' so that "3." stays "3.0".
    ++pos;
  }
  if (coefEnd == 0 || pos > idxE) {
    // Coefficient is not of the form "d.ddd"; there is nothing sensible to
    // rewrite in place.
    return;
  }
  str[pos++] = 'E';

  // Copy the exponent sign only if it is negative.
  size_t startExp = idxE + 1;
  if (startExp < str.length() && str[startExp] == '-') {
    str[pos++] = '-';
    ++startExp;
  }

  // Skip the '+' sign and the leading zeros of the exponent.
  const size_t expBegin = startExp;
  while (startExp < str.length() &&
         (str[startExp] == '0' || str[startExp] == '+')) {
    ++startExp;
  }
  VELOX_DCHECK_LT(
      startExp, str.length(), "Exponent should not be all zeros.");
  if (startExp == str.length() && startExp > expBegin &&
      str[startExp - 1] == '0') {
    // All-zero exponent: keep one digit instead of emitting a bare 'E'.
    --startExp;
  }

  // Here pos <= startExp. Dropping [pos, startExp) joins the rewritten prefix
  // with the significant exponent digits.
  str.erase(pos, startExp - pos);
}
};

// Allow conversions from string to TIMESTAMP type.
Expand Down

0 comments on commit 14dc44b

Please sign in to comment.