Skip to content

Commit

Permalink
fix: Casting Varchar to Timestamp should handle offsets that are not …
Browse files Browse the repository at this point in the history
…recognized time zones (#11849)

Summary:

When casting a Varchar to Timestamp, Presto Java allows an offset timestamp in place of a time zone.  This 
string is of the form +/-HH:MM:SS.mmm where all units except the hour are optional, the colons are optional, 
the . is optional and may also be a , and this string must be separated by the date/time by a single space.

This is not interpreted as a time zone but rather simply a milliseconds from UTC and is applied in addition to the 
time zone conversion if a session time zone is present. Note that strings that fit this pattern but are valid time
zones e.g. +02:00 are still treated as time zones.

Since this is not a true time zone it is not allowed when casting from Varchar to TimestampWithTimeZone or any
other string to time conversions (at least AFAICT).

This change updates fromTimestampWithTimezoneString to handle this case and return an offsetMillis in place
of a time zone if one is present.  Casting from Varchar to Timestamp has been updated to apply this offset to
the result, while all other locations where this function is called have been updated to throw if offsetMillis is 
present.

Differential Revision: D67182709
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Dec 13, 2024
1 parent 67a0c04 commit 671ea44
Show file tree
Hide file tree
Showing 9 changed files with 353 additions and 46 deletions.
41 changes: 31 additions & 10 deletions velox/expression/PrestoCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,38 @@ Expected<Timestamp> PrestoCastHooks::castStringToTimestamp(
// If the parsed string has timezone information, convert the timestamp at
// GMT at that time. For example, "1970-01-01 00:00:00 -00:01" is 60 seconds
// at GMT.
if (result.second != nullptr) {
result.first.toGMT(*result.second);

}
// If no timezone information is available in the input string, check if we
// should understand it as being at the session timezone, and if so, convert
// to GMT.
else if (options_.timeZone != nullptr) {
result.first.toGMT(*options_.timeZone);
if (result.timeZone != nullptr) {
result.timestamp.toGMT(*result.timeZone);
} else {
if (result.offsetMillis.has_value()) {
auto seconds = result.timestamp.getSeconds();
// use int128_t to avoid overflow.
int128_t nanos = result.timestamp.getNanos();

seconds += result.offsetMillis.value() / util::kMillisPerSecond;
nanos += (result.offsetMillis.value() % util::kMillisPerSecond) *
util::kNanosPerMicro * util::kMicrosPerMsec;

if (nanos < 0) {
seconds -= 1;
nanos += Timestamp::kNanosInSecond;
}

if (nanos > Timestamp::kMaxNanos) {
seconds += 1;
nanos -= Timestamp::kNanosInSecond;
}

result.timestamp = Timestamp(seconds, nanos);
}
// If no timezone information is available in the input string, check if we
// should understand it as being at the session timezone, and if so, convert
// to GMT.
if (options_.timeZone != nullptr) {
result.timestamp.toGMT(*options_.timeZone);
}
}
return result.first;
return result.timestamp;
}

Expected<Timestamp> PrestoCastHooks::castIntToTimestamp(int64_t seconds) const {
Expand Down
74 changes: 74 additions & 0 deletions velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -548,8 +548,41 @@ TEST_F(CastExprTest, stringToTimestamp) {
"1970-01-01 00:00:00-02:00",
"1970-01-01 00:00:00 +02",
"1970-01-01 00:00:00 -0101",
// Fully specified offset.
"1970-01-01 00:00:00 +01:01:01.001",
"1970-01-02 00:00:00 -01:01:01.001",
// Offset with two digit milliseconds.
"1970-01-01 00:00:00 +01:01:01.01",
"1970-01-02 00:00:00 -01:01:01.01",
// Offset with one digit milliseconds.
"1970-01-01 00:00:00 +01:01:01.1",
"1970-01-02 00:00:00 -01:01:01.1",
// Offset without milliseconds.
"1970-01-01 00:00:00 +01:01:01",
"1970-01-02 00:00:00 -01:01:01",
// Offset without seconds.
"1970-01-01 00:00:00 +23:01",
"1970-01-02 00:00:00 -23:01",
// Offset without minutes.
"1970-01-01 00:00:00 +23",
"1970-01-02 00:00:00 -23",
// Upper and lower limits of offsets.
"2000-01-01 12:13:14.123+23:59:59.999",
"2000-01-01 12:13:14.123-23:59:59.999",
// Comma instead of period for decimal in offset.
"1970-01-01 00:00:00 +01:01:01,001",
// Trailing spaces after offset.
"1970-01-02 00:00:00 -01:01:01.001 ",
// Overflow of nanoseconds in offset.
"1970-01-01 00:00:00.999 +01:01:01.002",
// Underflow of nanoseconds in offset.
"1970-01-02 00:00:00.001 -01:01:01.002",
// No optional separators
"1970-01-01 00:00:00 +010101001",
std::nullopt,
};

// two spaces, too large
std::vector<std::optional<Timestamp>> expected{
Timestamp(0, 0),
Timestamp(10800, 0),
Expand All @@ -559,6 +592,25 @@ TEST_F(CastExprTest, stringToTimestamp) {
Timestamp(7200, 0),
Timestamp(-7200, 0),
Timestamp(3660, 0),
Timestamp(3661, 1000000),
Timestamp(82738, 999000000),
Timestamp(3661, 10000000),
Timestamp(82738, 990000000),
Timestamp(3661, 100000000),
Timestamp(82738, 900000000),
Timestamp(3661, 0),
Timestamp(82739, 0),
Timestamp(82860, 0),
Timestamp(3540, 0),
Timestamp(82800, 0),
Timestamp(3600, 0),
Timestamp(946815194, 122000000),
Timestamp(946642394, 124000000),
Timestamp(3661, 1000000),
Timestamp(82738, 999000000),
Timestamp(3662, 1000000),
Timestamp(82738, 999000000),
Timestamp(3661, 1000000),
std::nullopt,
};
testCast<std::string, Timestamp>("timestamp", input, expected);
Expand All @@ -571,13 +623,15 @@ TEST_F(CastExprTest, stringToTimestamp) {
"1970-01-01 00:00 +01:00",
"1970-01-01 00:00 America/Sao_Paulo",
"2000-01-01 12:21:56Z",
"2000-01-01 12:21:56+01:01:01",
};
expected = {
Timestamp(28800, 0),
Timestamp(0, 0),
Timestamp(-3600, 0),
Timestamp(10800, 0),
Timestamp(946729316, 0),
Timestamp(946761777, 0),
};
testCast<std::string, Timestamp>("timestamp", input, expected);

Expand All @@ -602,6 +656,26 @@ TEST_F(CastExprTest, stringToTimestamp) {
(evaluateOnce<Timestamp, std::string>(
"try_cast(c0 as timestamp)", "2045-12-31 18:00:00")),
"Unable to convert timezone 'America/Los_Angeles' past 2037-11-01 09:00:00");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:01:01")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:01:01' to TIMESTAMP. Unknown timezone value: \"\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +24")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +24' to TIMESTAMP. Unknown timezone value: \"+24\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:60")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:60' to TIMESTAMP. Unknown timezone value: \"+01:60\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:01:60")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:01:60' to TIMESTAMP. Unknown timezone value: \"+01:01:60\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:01:01.1000")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:01:01.1000' to TIMESTAMP. Unknown timezone value: \"+01:01:01.1000\"");

setLegacyCast(true);
input = {
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -1520,7 +1520,8 @@ struct FromIso8601Timestamp {
return castResult.error();
}

auto [ts, timeZone] = castResult.value();
auto [ts, timeZone, offsetMillis] = castResult.value();
VELOX_DCHECK(!offsetMillis.has_value());
// Input string may not contain a timezone - if so, it is interpreted in
// session timezone.
if (!timeZone) {
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4366,6 +4366,9 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Timestamp) {
VELOX_ASSERT_THROW(
fromIso("1970-01-02T11:38:56.123 America/New_York"),
R"(Unable to parse timestamp value: "1970-01-02T11:38:56.123 America/New_York")");
VELOX_ASSERT_THROW(
fromIso("1970-01-02T11:38:56+16:00:01"),
"Unknown timezone value: \"+16:00:01\"");

VELOX_ASSERT_THROW(fromIso("T"), R"(Unable to parse timestamp value: "T")");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarcharInvalidInput) {
const auto invalidStringVector6 =
makeNullableFlatVector<StringView>({"2012-10 America/Los_Angeles"});

const auto invalidStringVector7 =
makeNullableFlatVector<StringView>({"2012-10-01 +16:00"});

auto millis = parseTimestamp("2012-10-31 07:00:47").toMillis();
auto timestamps = std::vector<int64_t>{millis};

Expand Down Expand Up @@ -204,6 +207,9 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarcharInvalidInput) {
VELOX_ASSERT_THROW(
testCast(invalidStringVector6, expected),
"Unable to parse timestamp value: \"2012-10 America/Los_Angeles\"");
VELOX_ASSERT_THROW(
testCast(invalidStringVector7, expected),
"Unknown timezone value in: \"2012-10-01 +16:00\"");
}

TEST_F(TimestampWithTimeZoneCastTest, toTimestamp) {
Expand Down
10 changes: 9 additions & 1 deletion velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,18 @@ void castFromString(
if (castResult.hasError()) {
context.setStatus(row, castResult.error());
} else {
auto [ts, timeZone] = castResult.value();
auto [ts, timeZone, millisOffset] = castResult.value();
// Input string may not contain a timezone - if so, it is interpreted in
// session timezone.
if (timeZone == nullptr) {
if (millisOffset.has_value()) {
context.setStatus(
row,
Status::UserError(
"Unknown timezone value in: \"{}\"",
inputVector.valueAt(row)));
return;
}
const auto& config = context.execCtx()->queryCtx()->queryConfig();
timeZone = getTimeZoneFromConfig(config);
}
Expand Down
137 changes: 125 additions & 12 deletions velox/type/TimestampConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,11 +529,116 @@ bool tryParseTimeString(
return true;
}

// Parses a variety of timestamp strings, depending on the value of `parseMode`.
// Consumes as much of the string as it can and sets `result` to the
// timestamp from whatever it successfully parses. `pos` is set to the position
// of first character that was not consumed. Returns true if it successfully
// parsed at least a date, `result` is only set if true is returned.
// String format is [+/-]hh:mm:ss.MMM (minutes, seconds, and milliseconds are
// optional).
bool tryParsePrestoTimeOffsetString(
const char* buf,
size_t len,
size_t& pos,
int64_t& result) {
static constexpr int sep = ':';
int32_t hour = 0, min = 0, sec = 0, millis = 0;
pos = 0;
result = 0;

if (len == 0) {
return false;
}

if (buf[pos] != '+' && buf[pos] != '-') {
return false;
}

bool positive = buf[pos++] == '+';

if (pos >= len) {
return false;
}

// Read the hours.
if (!parseDoubleDigit(buf, len, pos, hour)) {
return false;
}
if (hour < 0 || hour >= 24) {
return false;
}

result += hour * kMillisPerHour;

if (pos >= len || (buf[pos] != sep && !characterIsDigit(buf[pos]))) {
result *= positive ? 1 : -1;
return pos == len;
}

// Skip the separator.
if (buf[pos] == sep) {
pos++;
}

// Read the minutes.
if (!parseDoubleDigit(buf, len, pos, min)) {
return false;
}
if (min < 0 || min >= 60) {
return false;
}

result += min * kMillisPerMinute;

if (pos >= len || (buf[pos] != sep && !characterIsDigit(buf[pos]))) {
result *= positive ? 1 : -1;
return pos == len;
}

// Skip the separator.
if (buf[pos] == sep) {
pos++;
}

// Try to read seconds.
if (!parseDoubleDigit(buf, len, pos, sec)) {
return false;
}
if (sec < 0 || sec >= 60) {
return false;
}

result += sec * kMillisPerSecond;

if (pos >= len ||
(buf[pos] != '.' && buf[pos] != ',' && !characterIsDigit(buf[pos]))) {
result *= positive ? 1 : -1;
return pos == len;
}

// Skip the decimal.
if (buf[pos] == '.' || buf[pos] == ',') {
pos++;
}

// Try to read microseconds.
if (pos >= len) {
return false;
}

// We expect milliseconds.
int32_t mult = 100;
for (; pos < len && mult > 0 && characterIsDigit(buf[pos]);
pos++, mult /= 10) {
millis += (buf[pos] - '0') * mult;
}

result += millis;
result *= positive ? 1 : -1;
return pos == len;
}

// Parses a variety of timestamp strings, depending on the value of
// `parseMode`. Consumes as much of the string as it can and sets `result` to
// the timestamp from whatever it successfully parses. `pos` is set to the
// position of first character that was not consumed. Returns true if it
// successfully parsed at least a date, `result` is only set if true is
// returned.
bool tryParseTimestampString(
const char* buf,
size_t len,
Expand Down Expand Up @@ -583,8 +688,8 @@ bool tryParseTimestampString(
if (!tryParseTimeString(
buf + pos, len - pos, timePos, microsSinceMidnight, parseMode)) {
// The rest of the string is not a valid time, but it could be relevant to
// the caller (e.g. it could be a time zone), return the date we parsed and
// let them decide what to do with the rest.
// the caller (e.g. it could be a time zone), return the date we parsed
// and let them decide what to do with the rest.
result = fromDatetime(daysSinceEpoch, 0);
return true;
}
Expand Down Expand Up @@ -857,8 +962,7 @@ fromTimestampString(const char* str, size_t len, TimestampParseMode parseMode) {
return resultTimestamp;
}

Expected<std::pair<Timestamp, const tz::TimeZone*>>
fromTimestampWithTimezoneString(
Expected<ParsedTimestampWithTimeZone> fromTimestampWithTimezoneString(
const char* str,
size_t len,
TimestampParseMode parseMode) {
Expand All @@ -870,6 +974,7 @@ fromTimestampWithTimezoneString(
}

const tz::TimeZone* timeZone = nullptr;
std::optional<int64_t> offset = std::nullopt;

if (pos < len && parseMode != TimestampParseMode::kIso8601 &&
characterIsSpace(str[pos])) {
Expand All @@ -894,8 +999,16 @@ fromTimestampWithTimezoneString(
std::string_view timeZoneName(str + pos, timezonePos - pos);

if ((timeZone = tz::locateZone(timeZoneName, false)) == nullptr) {
return folly::makeUnexpected(
Status::UserError("Unknown timezone value: \"{}\"", timeZoneName));
int64_t offsetMillis = 0;
size_t offsetPos = 0;
if (parseMode == TimestampParseMode::kPrestoCast &&
tryParsePrestoTimeOffsetString(
str + pos, timezonePos - pos, offsetPos, offsetMillis)) {
offset = offsetMillis;
} else {
return folly::makeUnexpected(
Status::UserError("Unknown timezone value: \"{}\"", timeZoneName));
}
}

// Skip any spaces at the end.
Expand All @@ -908,7 +1021,7 @@ fromTimestampWithTimezoneString(
return folly::makeUnexpected(parserError(str, len));
}
}
return std::make_pair(resultTimestamp, timeZone);
return {{resultTimestamp, timeZone, offset}};
}

int32_t toDate(const Timestamp& timestamp, const tz::TimeZone* timeZone_) {
Expand Down
Loading

0 comments on commit 671ea44

Please sign in to comment.