Skip to content

Commit

Permalink
Optimize date_trunc (facebookincubator#10065)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#10065

Replace the slow `timegm` function with our own implementation. This reduces CPU time for some queries by more than 30x (from 140.78 days to 4.46 days, versus 14.00 days for Java).

Differential Revision: D58191555
  • Loading branch information
Yuhta authored and facebook-github-bot committed Jun 6, 2024
1 parent 7164f92 commit 1c4b9f4
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 18 deletions.
12 changes: 9 additions & 3 deletions velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -923,7 +923,9 @@ struct DateTruncFunction : public TimestampWithTimezoneSupport<T> {
auto dateTime = getDateTime(timestamp, timeZone_);
adjustDateTime(dateTime, unit);

result = Timestamp(timegm(&dateTime), 0);
int64_t seconds;
VELOX_CHECK(Timestamp::utcToEpoch(dateTime, seconds));
result = Timestamp(seconds, 0);
if (timeZone_ != nullptr) {
result.toGMT(*timeZone_);
}
Expand All @@ -945,7 +947,9 @@ struct DateTruncFunction : public TimestampWithTimezoneSupport<T> {
auto dateTime = getDateTime(date);
adjustDateTime(dateTime, unit);

result = timegm(&dateTime) / kSecondsInDay;
int64_t seconds;
VELOX_CHECK(Timestamp::utcToEpoch(dateTime, seconds));
result = seconds / kSecondsInDay;
}

FOLLY_ALWAYS_INLINE void call(
Expand All @@ -970,7 +974,9 @@ struct DateTruncFunction : public TimestampWithTimezoneSupport<T> {
auto timestamp = this->toTimestamp(timestampWithTimezone);
auto dateTime = getDateTime(timestamp, nullptr);
adjustDateTime(dateTime, unit);
timestamp = Timestamp::fromMillis(timegm(&dateTime) * 1000);
int64_t seconds;
VELOX_CHECK(Timestamp::utcToEpoch(dateTime, seconds));
timestamp = Timestamp::fromMillis(seconds * 1000);
timestamp.toGMT(unpackZoneKeyId(timestampWithTimezone));

result = pack(timestamp.toMillis(), unpackZoneKeyId(timestampWithTimezone));
Expand Down
50 changes: 38 additions & 12 deletions velox/type/Timestamp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ namespace {

constexpr int kTmYearBase = 1900;
constexpr int64_t kLeapYearOffset = 4000000000ll;
constexpr int64_t kSecondsPerHour = 3600;
constexpr int64_t kSecondsPerDay = 24 * kSecondsPerHour;

inline bool isLeap(int64_t y) {
return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0);
Expand All @@ -166,9 +168,13 @@ inline int64_t leapThroughEndOf(int64_t y) {
return y / 4 - y / 100 + y / 400;
}

const int monthLengths[][12] = {
{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
// Signed number of days from January 1st of year y1 to January 1st of year
// y2: 365 days per year plus the leap days counted through the end of the
// year preceding each endpoint.
inline int64_t daysBetweenYears(int64_t y1, int64_t y2) {
  const int64_t yearSpan = y2 - y1;
  const int64_t leapDays = leapThroughEndOf(y2 - 1) - leapThroughEndOf(y1 - 1);
  return 365 * yearSpan + leapDays;
}

// Cumulative day counts: entry [leap][m] is the number of days in the year
// that precede the first day of zero-based month m. The first index is the
// result of isLeap() for the year (0 = common year, 1 = leap year).
const int16_t daysBeforeFirstDayOfMonth[][12] = {
// Common year (February has 28 days).
{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334},
// Leap year (February has 29 days, so every later month starts a day later).
{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335},
};

// clang-format off
Expand All @@ -187,11 +193,10 @@ void appendSmallInt(int n, std::string& out) {
VELOX_DCHECK_LE(n, 61);
out.append(intToStr[n], 2);
}

} // namespace

bool Timestamp::epochToUtc(int64_t epoch, std::tm& tm) {
constexpr int kSecondsPerHour = 3600;
constexpr int kSecondsPerDay = 24 * kSecondsPerHour;
constexpr int kDaysPerYear = 365;
int64_t days = epoch / kSecondsPerDay;
int64_t rem = epoch % kSecondsPerDay;
Expand All @@ -214,8 +219,7 @@ bool Timestamp::epochToUtc(int64_t epoch, std::tm& tm) {
bool leapYear;
while (days < 0 || days >= kDaysPerYear + (leapYear = isLeap(y))) {
auto newy = y + days / kDaysPerYear - (days < 0);
days -= (newy - y) * kDaysPerYear + leapThroughEndOf(newy - 1) -
leapThroughEndOf(y - 1);
days -= daysBetweenYears(y, newy);
y = newy;
}
y -= kTmYearBase;
Expand All @@ -225,15 +229,37 @@ bool Timestamp::epochToUtc(int64_t epoch, std::tm& tm) {
}
tm.tm_year = y;
tm.tm_yday = days;
auto* ip = monthLengths[leapYear];
for (tm.tm_mon = 0; days >= ip[tm.tm_mon]; ++tm.tm_mon) {
days = days - ip[tm.tm_mon];
}
tm.tm_mday = days + 1;
auto* months = daysBeforeFirstDayOfMonth[leapYear];
tm.tm_mon = std::upper_bound(months, months + 12, days) - months - 1;
tm.tm_mday = days - months[tm.tm_mon] + 1;
tm.tm_isdst = 0;
return true;
}

// static
//
// Converts a broken-down UTC time into seconds relative to
// 1970-01-01 00:00:00 without going through timegm.
//
// An out-of-range tm_mon is folded into the year. tm_mday, tm_hour, tm_min and
// tm_sec are not range-checked: they are added arithmetically, so out-of-range
// values simply carry over. tm_wday, tm_yday and tm_isdst are not read.
//
// @param tm Broken-down time to convert.
// @param seconds Receives the resulting epoch seconds.
// @return Always true in this implementation.
bool Timestamp::utcToEpoch(const std::tm& tm, int64_t& seconds) {
  // tm_year is expected to be a 32-bit field; everything below is computed in
  // 64-bit integers.
  static_assert(sizeof(decltype(tm.tm_year)) == 4);

  // tm_year counts years since 1900.
  int64_t civilYear = tm.tm_year + 1900ll;
  int64_t monthIndex = tm.tm_mon;
  if (FOLLY_UNLIKELY(monthIndex < 0)) {
    // Borrow enough whole years to bring the month index back into [0, 11].
    const auto borrowedYears = (11 - monthIndex) / 12;
    civilYear -= borrowedYears;
    monthIndex += 12 * borrowedYears;
  } else if (FOLLY_UNLIKELY(monthIndex > 11)) {
    // Carry surplus months into the year.
    civilYear += monthIndex / 12;
    monthIndex %= 12;
  }

  // Zero-based day within the year (tm_mday is one-based).
  const auto* monthStarts = daysBeforeFirstDayOfMonth[isLeap(civilYear)];
  const int64_t dayOfYear = monthStarts[monthIndex] + int64_t{tm.tm_mday} - 1;

  // Days elapsed since 1970-01-01.
  const int64_t daysSinceEpoch = daysBetweenYears(1970, civilYear) + dayOfYear;

  const int64_t secondOfDay =
      kSecondsPerHour * tm.tm_hour + 60ll * tm.tm_min + tm.tm_sec;
  seconds = kSecondsPerDay * daysSinceEpoch + secondOfDay;
  return true;
}

StringView Timestamp::tmToStringView(
const std::tm& tmValue,
uint64_t nanos,
Expand Down
6 changes: 6 additions & 0 deletions velox/type/Timestamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,12 @@ struct Timestamp {
/// Return whether the epoch second can be converted to a valid std::tm.
static bool epochToUtc(int64_t seconds, std::tm& out);

/// Our own version of timegm to avoid expensive calls to __tz_convert.
///
/// Return whether the conversion is successful. This function is guaranteed
/// to be successful when the same input on std::timegm is successful.
static bool utcToEpoch(const std::tm& tm, int64_t& seconds);

/// Converts a std::tm to a time/date/timestamp string in ISO 8601 format
/// according to TimestampToStringOptions.
/// @param startPosition the start position of pre-allocated memory to write
Expand Down
70 changes: 67 additions & 3 deletions velox/type/tests/TimestampTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,17 @@ TEST(TimestampTest, toStringPrestoCastBehavior) {
}

namespace {

// Seed used by the randomized tests in this file. Returns the fixed value 42
// unless the environment variable VELOX_TEST_USE_RANDOM_SEED is set (to any
// value), in which case a seed is drawn from std::random_device and logged so
// that a failing run can be reproduced.
uint64_t randomSeed() {
  if (getenv("VELOX_TEST_USE_RANDOM_SEED") == nullptr) {
    return 42;
  }
  auto seed = std::random_device{}();
  LOG(INFO) << "Random seed: " << seed;
  return seed;
}

std::string toStringAlt(
const Timestamp& t,
TimestampToStringOptions::Precision precision) {
Expand All @@ -249,12 +260,33 @@ std::string toStringAlt(
oss << '.' << std::setfill('0') << std::setw(width) << value;
return oss.str();
}

// Cross-checks Timestamp::utcToEpoch against timegm for one input.
//
// The arguments are stored verbatim into the matching std::tm fields, so
// `year` is a tm_year value (years since 1900) and `mon` is a zero-based
// tm_mon value; none of them are range-checked here.
//
// Expectations recorded:
//  - if utcToEpoch reports failure, timegm must have failed as well;
//  - if both succeed, the two results must be equal.
//
// Returns true when timegm succeeded on the input.
bool checkUtcToEpoch(int year, int mon, int mday, int hour, int min, int sec) {
  SCOPED_TRACE(fmt::format(
      "{}-{:02}-{:02} {:02}:{:02}:{:02}", year, mon, mday, hour, min, sec));

  std::tm input{};
  input.tm_year = year;
  input.tm_mon = mon;
  input.tm_mday = mday;
  input.tm_hour = hour;
  input.tm_min = min;
  input.tm_sec = sec;

  // -1 is also a legitimate result (one second before the epoch), so errno is
  // cleared first and consulted to tell a real failure apart.
  errno = 0;
  auto expected = timegm(&input);
  bool error = (expected == -1) && (errno != 0);

  // utcToEpoch runs after timegm and therefore sees whatever timegm left in
  // `input`.
  int64_t actual;
  const bool converted = Timestamp::utcToEpoch(input, actual);
  if (!converted) {
    EXPECT_TRUE(error);
  } else if (!error) {
    EXPECT_EQ(actual, expected);
  }
  return !error;
}

} // namespace

TEST(TimestampTest, compareWithToStringAlt) {
uint64_t seed = 42;
// seed = std::random_device{}();
std::default_random_engine gen(seed);
std::default_random_engine gen(randomSeed());
std::uniform_int_distribution<int64_t> distSec(
Timestamp::kMinSeconds, Timestamp::kMaxSeconds);
std::uniform_int_distribution<uint64_t> distNano(0, Timestamp::kMaxNanos);
Expand All @@ -271,6 +303,38 @@ TEST(TimestampTest, compareWithToStringAlt) {
}
}

// Fixed-input comparison of Timestamp::utcToEpoch against timegm.
//
// checkUtcToEpoch stores its arguments directly into std::tm fields, so the
// first argument is a tm_year value (years since 1900) and the second a
// zero-based tm_mon value; e.g. (1970, 1, 1, ...) is not 1970-01-01. It
// returns whether timegm succeeded, so every ASSERT_TRUE below additionally
// requires timegm itself to accept the input.
TEST(TimestampTest, utcToEpoch) {
// In-range and slightly out-of-range fields (tm_mon == 12 exercises the
// month-overflow path).
ASSERT_TRUE(checkUtcToEpoch(1970, 1, 1, 0, 0, 0));
ASSERT_TRUE(checkUtcToEpoch(2001, 11, 12, 18, 31, 1));
ASSERT_TRUE(checkUtcToEpoch(1969, 12, 31, 23, 59, 59));
ASSERT_TRUE(checkUtcToEpoch(1969, 12, 31, 23, 59, 58));
// Extreme tm_year values with ordinary remaining fields.
ASSERT_TRUE(checkUtcToEpoch(INT32_MAX, 11, 30, 23, 59, 59));
ASSERT_TRUE(checkUtcToEpoch(INT32_MIN, 1, 1, 0, 0, 0));
// Every field at its extreme.
// NOTE(review): the year is presumably pulled in by INT32_MAX / 11 (resp.
// INT32_MIN / 11) so that it still fits in an int after the month overflow is
// folded into it — confirm.
ASSERT_TRUE(checkUtcToEpoch(
INT32_MAX - INT32_MAX / 11,
INT32_MAX,
INT32_MAX,
INT32_MAX,
INT32_MAX,
INT32_MAX));
ASSERT_TRUE(checkUtcToEpoch(
INT32_MIN - INT32_MIN / 11,
INT32_MIN,
INT32_MIN,
INT32_MIN,
INT32_MIN,
INT32_MIN));
}

// Randomized comparison of Timestamp::utcToEpoch against timegm: each of the
// six std::tm fields is drawn uniformly from the full int32_t range.
TEST(TimestampTest, utcToEpochRandomInputs) {
std::default_random_engine gen(randomSeed());
std::uniform_int_distribution<int32_t> dist(INT32_MIN, INT32_MAX);
for (int i = 0; i < 10'000; ++i) {
// The return value (whether timegm succeeded) is intentionally not asserted;
// only the expectations recorded inside checkUtcToEpoch apply.
// The evaluation order of the six dist(gen) arguments is unspecified in C++,
// so which draw ends up in which field may differ between compilers; all
// fields share the same distribution.
checkUtcToEpoch(
dist(gen), dist(gen), dist(gen), dist(gen), dist(gen), dist(gen));
}
}

TEST(TimestampTest, increaseOperator) {
auto ts = Timestamp(0, 999999998);
EXPECT_EQ("1970-01-01T00:00:00.999999998", ts.toString());
Expand Down

0 comments on commit 1c4b9f4

Please sign in to comment.