Skip to content

Commit

Permalink
Optimize DateType::toString(int32_t) and functions::getDateTime (#6913)
Browse files Browse the repository at this point in the history
Summary:

Optimize `DateType::toString(int32_t)` and `functions::getDateTime` by reusing the same optimization we did for `Timestamp::toString`.

Fix #6918

Reviewed By: oerling

Differential Revision: D49940219
  • Loading branch information
Yuhta authored and facebook-github-bot committed Oct 5, 2023
1 parent 1c856bc commit 584c3ea
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 60 deletions.
10 changes: 4 additions & 6 deletions velox/functions/lib/TimeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ FOLLY_ALWAYS_INLINE
std::tm getDateTime(Timestamp timestamp, const date::time_zone* timeZone) {
int64_t seconds = getSeconds(timestamp, timeZone);
std::tm dateTime;
VELOX_USER_CHECK_NOT_NULL(
gmtime_r((const time_t*)&seconds, &dateTime),
VELOX_USER_CHECK(
epochToUtc(seconds, dateTime),
"Timestamp is too large: {} seconds since epoch",
seconds);
return dateTime;
Expand All @@ -63,10 +63,8 @@ FOLLY_ALWAYS_INLINE
std::tm getDateTime(int32_t days) {
int64_t seconds = days * kSecondsInDay;
std::tm dateTime;
VELOX_USER_CHECK_NOT_NULL(
gmtime_r((const time_t*)&seconds, &dateTime),
"Date is too large: {} days",
days);
VELOX_USER_CHECK(
epochToUtc(seconds, dateTime), "Date is too large: {} days", days);
return dateTime;
}

Expand Down
90 changes: 50 additions & 40 deletions velox/type/Timestamp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,14 +121,15 @@ void Timestamp::toTimezone(int16_t tzID) {
namespace {

constexpr int kTmYearBase = 1900;
constexpr int64_t kLeapYearOffset = 4000000000ll;

// Reports whether `y` is a leap year under the Gregorian rule: every year
// divisible by 4 is a leap year, except years divisible by 100, which are leap
// years only when they are also divisible by 400.
inline bool isLeap(int64_t y) {
  if (y % 4 != 0) {
    return false;
  }
  if (y % 100 != 0) {
    return true;
  }
  return y % 400 == 0;
}

// Evaluates y / 4 - y / 100 + y / 400 on the offset year, i.e. a running count
// of leap years up to and including `y`, measured from a shifted origin.
// NOTE(review): because of the shift, the result looks intended to be used as a
// difference between two calls rather than as an absolute count — confirm
// against callers.
inline int64_t leapThroughEndOf(int64_t y) {
// Add a large offset to make the calculation for negative years correct.
y += 400000000;
y += kLeapYearOffset;
// The truncating divisions below assume the shifted year is non-negative.
VELOX_DCHECK_GE(y, 0);
return y / 4 - y / 100 + y / 400;
}
Expand All @@ -138,15 +139,31 @@ const int monthLengths[][12] = {
{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
};

// Our own version of gmtime_r to avoid expensive calls to __tz_convert. This
// might not be very significant in micro benchmark, but is causing significant
// context switching cost in real world queries with higher concurrency (71% of
// time is on __tz_convert for some queries).
std::tm toUtc(const time_t& epoch) {
// Lookup table of zero-padded, two-character decimal strings for the integers
// 0 through 61, indexed by value. Each entry holds the two digits plus the
// terminating NUL. appendSmallInt reads it to emit fixed-width fields.
// NOTE(review): the upper bound of 61 presumably mirrors the widest range
// allowed for tm_sec — confirm.
// clang-format off
const char intToStr[][3] = {
"00", "01", "02", "03", "04", "05", "06", "07", "08", "09",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
"40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
"50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
"60", "61",
};
// clang-format on

// Appends `n` to `out` as exactly two decimal digits, zero-padded (for example
// "07" or "42"), using the intToStr lookup table.
//
// `n` must lie within the table's range [0, 61]. Both bounds are asserted with
// VELOX_DCHECK_LE, because a negative or too-large value would index outside
// the table.
void appendSmallInt(int n, std::string& out) {
  VELOX_DCHECK_LE(0, n);
  VELOX_DCHECK_LE(n, 61);
  // Append only the two digit characters, not the terminating NUL.
  out.append(intToStr[n], 2);
}

} // namespace

bool epochToUtc(int64_t epoch, std::tm& tm) {
constexpr int kSecondsPerHour = 3600;
constexpr int kSecondsPerDay = 24 * kSecondsPerHour;
constexpr int kDaysPerYear = 365;
std::tm tm;
int64_t days = epoch / kSecondsPerDay;
int64_t rem = epoch % kSecondsPerDay;
while (rem < 0) {
Expand All @@ -162,54 +179,41 @@ std::tm toUtc(const time_t& epoch) {
tm.tm_wday += 7;
}
int64_t y = 1970;
if (y + days / kDaysPerYear <= -kLeapYearOffset + 10) {
return false;
}
bool leapYear;
while (days < 0 || days >= kDaysPerYear + (leapYear = isLeap(y))) {
auto newy = y + days / kDaysPerYear - (days < 0);
days -= (newy - y) * kDaysPerYear + leapThroughEndOf(newy - 1) -
leapThroughEndOf(y - 1);
y = newy;
}
tm.tm_year = y - kTmYearBase;
y -= kTmYearBase;
if (y > std::numeric_limits<decltype(tm.tm_year)>::max() ||
y < std::numeric_limits<decltype(tm.tm_year)>::min()) {
return false;
}
tm.tm_year = y;
tm.tm_yday = days;
auto* ip = monthLengths[leapYear];
for (tm.tm_mon = 0; days >= ip[tm.tm_mon]; ++tm.tm_mon) {
days = days - ip[tm.tm_mon];
}
tm.tm_mday = days + 1;
tm.tm_isdst = 0;
return tm;
return true;
}

// Lookup table of zero-padded, two-character decimal strings for the integers
// 0 through 61, indexed by value. Each entry holds the two digits plus the
// terminating NUL. appendSmallInt reads it to emit fixed-width fields.
// NOTE(review): the upper bound of 61 presumably mirrors the widest range
// allowed for tm_sec — confirm.
// clang-format off
const char intToStr[][3] = {
"00", "01", "02", "03", "04", "05", "06", "07", "08", "09",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
"40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
"50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
"60", "61",
};
// clang-format on

// Appends `n` to `out` as exactly two decimal digits, zero-padded (for example
// "07" or "42"), using the intToStr lookup table.
//
// `n` must lie within the table's range [0, 61]. Both bounds are asserted with
// VELOX_DCHECK_LE, because a negative or too-large value would index outside
// the table.
void appendSmallInt(int n, std::string& out) {
  VELOX_DCHECK_LE(0, n);
  VELOX_DCHECK_LE(n, 61);
  // Append only the two digit characters, not the terminating NUL.
  out.append(intToStr[n], 2);
}

} // namespace

std::string Timestamp::toString(const TimestampToStringOptions& options) const {
auto tmValue = toUtc(seconds_);
std::string tmToString(
const std::tm& tmValue,
int nanos,
const TimestampToStringOptions& options) {
VELOX_DCHECK_GE(nanos, 0);
VELOX_DCHECK_LT(nanos, 1'000'000'000);
int width = options.precision;
auto value = nanos_;
if (options.precision == TimestampToStringOptions::kMilliseconds) {
value /= 1'000'000;
}
std::string out;
out.reserve(26 + width);
out.reserve(options.dateOnly ? 10 : 26 + width);
int n = kTmYearBase + tmValue.tm_year;
bool negative = n < 0;
if (negative) {
Expand All @@ -230,6 +234,9 @@ std::string Timestamp::toString(const TimestampToStringOptions& options) const {
appendSmallInt(1 + tmValue.tm_mon, out);
out += '-';
appendSmallInt(tmValue.tm_mday, out);
if (options.dateOnly) {
return out;
}
out += options.dateTimeSeparator;
appendSmallInt(tmValue.tm_hour, out);
out += ':';
Expand All @@ -238,9 +245,12 @@ std::string Timestamp::toString(const TimestampToStringOptions& options) const {
appendSmallInt(tmValue.tm_sec, out);
out += '.';
int offset = out.size();
while (value > 0) {
out += '0' + value % 10;
value /= 10;
if (options.precision == TimestampToStringOptions::kMilliseconds) {
nanos /= 1'000'000;
}
while (nanos > 0) {
out += '0' + nanos % 10;
nanos /= 10;
}
while (out.size() - offset < width) {
out += '0';
Expand Down
23 changes: 21 additions & 2 deletions velox/type/Timestamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,27 @@ class time_zone;
namespace facebook::velox {

// Formatting options accepted by tmToString and Timestamp::toString.
struct TimestampToStringOptions {
// Number of fractional-second digits to emit; the enumerator values are the
// digit counts themselves (3 for milliseconds, 9 for nanoseconds).
enum Precision : int {
enum Precision : int8_t {
kMilliseconds = 3,
kNanoseconds = 9,
} precision = kNanoseconds;

// NOTE(review): presumably requests zero-padding of the year field; the code
// that reads this flag is not visible here — confirm.
bool zeroPaddingYear = false;
// Character written between the date part and the time part.
char dateTimeSeparator = 'T';
// When true, formatting stops after the day of month, so no separator, time
// of day or fractional seconds are written.
bool dateOnly = false;
};

// Our own version of gmtime_r, written to avoid its expensive calls to
// __tz_convert. The difference may not be very significant in micro
// benchmarks, but __tz_convert causes significant context-switching cost in
// real-world queries with higher concurrency (71% of the time is spent in
// __tz_convert for some queries).
//
// Converts `seconds` since the Unix epoch into a broken-down UTC time stored
// in `out`.
//
// Returns whether the epoch seconds can be converted to a valid std::tm. When
// false is returned, `out` may already have been partially written and should
// not be relied upon.
bool epochToUtc(int64_t seconds, std::tm& out);

// Formats a broken-down time, plus a sub-second count in nanoseconds, as text
// according to the given options.
//
// `nanos` is expected to be in [0, 1'000'000'000). With options.dateOnly set,
// the result ends after the day of month; otherwise the date is followed by
// options.dateTimeSeparator, the time of day, a '.' and the fractional part
// rendered with options.precision digits.
std::string
tmToString(const std::tm&, int nanos, const TimestampToStringOptions&);

struct Timestamp {
public:
static constexpr int64_t kMillisecondsInSecond = 1'000;
Expand Down Expand Up @@ -273,7 +285,14 @@ struct Timestamp {
return StringView("TODO: Implement");
};

std::string toString(const TimestampToStringOptions& = {}) const;
// Renders this timestamp as text: converts seconds_ to a broken-down UTC time
// with epochToUtc, then formats it together with nanos_ via tmToString using
// `options`. Fails the VELOX_USER_CHECK below when epochToUtc reports that
// seconds_ cannot be converted.
std::string toString(const TimestampToStringOptions& options = {}) const {
std::tm tm;
VELOX_USER_CHECK(
epochToUtc(seconds_, tm),
"Can't convert seconds to time: {}",
seconds_);
return tmToString(tm, nanos_, options);
}

operator std::string() const {
return toString();
Expand Down
17 changes: 5 additions & 12 deletions velox/type/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,19 +972,12 @@ std::string DateType::toString(int32_t days) const {
// Find the number of seconds for the days_;
// Casting 86400 to int64 to handle overflows gracefully.
int64_t daySeconds = days * (int64_t)(86400);

// gmtime is not thread-safe. Make sure to use gmtime_r.
std::tm tmValue;
VELOX_CHECK_NOT_NULL(
gmtime_r((const time_t*)&daySeconds, &tmValue),
"Can't convert days to dates: {}",
days);

// return ISO 8601 time format.
// %F - equivalent to "%Y-%m-%d" (the ISO 8601 date format)
std::ostringstream oss;
oss << std::put_time(&tmValue, "%F");
return oss.str();
VELOX_CHECK(
epochToUtc(daySeconds, tmValue), "Can't convert days to dates: {}", days);
TimestampToStringOptions options;
options.dateOnly = true;
return tmToString(tmValue, 0, options);
}

int32_t DateType::toDays(folly::StringPiece in) const {
Expand Down
33 changes: 33 additions & 0 deletions velox/type/tests/TimestampTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,5 +298,38 @@ TEST(TimestampTest, outOfRange) {
VELOX_ASSERT_THROW(
t.toTimezone(*timezone), "Timestamp is outside of supported range");
}

// epochToUtc must report failure, rather than produce a std::tm, for epoch
// values of very large magnitude in either direction.
TEST(TimestampTest, epochToUtc) {
  constexpr int64_t kHugeEpoch = 1ll << 60;
  std::tm result;
  ASSERT_FALSE(epochToUtc(-kHugeEpoch, result));
  ASSERT_FALSE(epochToUtc(kHugeEpoch, result));
}

// Cross-checks epochToUtc against gmtime_r on pseudo-random epoch values drawn
// from the full time_t range: whenever gmtime_r succeeds, epochToUtc must
// succeed with identical fields; whenever gmtime_r fails, epochToUtc must
// fail as well.
TEST(TimestampTest, randomEpochToUtc) {
  // Fixed seed keeps the test deterministic; swap in the commented line to
  // explore other sequences locally.
  uint64_t seed = 42;
  // seed = std::random_device{}();
  std::default_random_engine gen(seed);
  // Draw the values as time_t so that &epoch has exactly the pointer type
  // gmtime_r expects, even on platforms where time_t and int64_t are distinct
  // types.
  std::uniform_int_distribution<time_t> dist(
      std::numeric_limits<time_t>::min(), std::numeric_limits<time_t>::max());
  std::tm actual, expected;
  for (int i = 0; i < 10000; ++i) {
    const time_t epoch = dist(gen);
    SCOPED_TRACE(fmt::format("epoch={}", epoch));
    if (gmtime_r(&epoch, &expected)) {
      ASSERT_TRUE(epochToUtc(epoch, actual));
      ASSERT_EQ(expected.tm_year, actual.tm_year);
      ASSERT_EQ(expected.tm_yday, actual.tm_yday);
      ASSERT_EQ(expected.tm_mon, actual.tm_mon);
      ASSERT_EQ(expected.tm_mday, actual.tm_mday);
      ASSERT_EQ(expected.tm_wday, actual.tm_wday);
      ASSERT_EQ(expected.tm_hour, actual.tm_hour);
      ASSERT_EQ(expected.tm_min, actual.tm_min);
      ASSERT_EQ(expected.tm_sec, actual.tm_sec);
    } else {
      ASSERT_FALSE(epochToUtc(epoch, actual));
    }
  }
}

} // namespace
} // namespace facebook::velox

0 comments on commit 584c3ea

Please sign in to comment.