diff --git a/.github/workflows/asan_test.yml b/.github/workflows/asan_test.yml index f4a31525f3..0638c13dc1 100644 --- a/.github/workflows/asan_test.yml +++ b/.github/workflows/asan_test.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -name: Address Sanitizer Tests +name: Address/Undefined Sanitizer Tests on: pull_request: @@ -47,18 +47,19 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - - name: Configure and Build with ASAN + - name: Configure and Build with ASAN and UBSAN env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} run: | mkdir build && cd build - cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DBUILD_JAVA=OFF + cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_UBSAN=ON -DBUILD_ENABLE_AVX512=ON -DBUILD_CPP_ENABLE_METRICS=ON -DBUILD_JAVA=OFF make - name: Run Tests working-directory: build env: ASAN_OPTIONS: detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0 LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/lsan-suppressions.txt + UBSAN_OPTIONS: print_stacktrace=1 run: | ctest --output-on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index eadc13eae6..502fe2fe02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,10 @@ option(ORC_ENABLE_CLANG_TOOLS "Enable Clang tools" ON) +option(ENABLE_UBSAN + "Enable Undefined Behavior Sanitizer" + OFF) + # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to ReleaseWithDebugInfo") @@ -171,6 +175,17 @@ if (ENABLE_ASAN) endif() endif() +# Configure Undefined Behavior Sanitizer if enabled +if (ENABLE_UBSAN) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all") + message(STATUS "Undefined Behavior Sanitizer enabled") + else() + message(WARNING "Undefined Behavior Sanitizer is only supported for GCC and Clang compilers") + endif() +endif() + enable_testing() INCLUDE(GNUInstallDirs) # Put it before ThirdpartyToolchain to make CMAKE_INSTALL_LIBDIR available. diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh index 6954c771cf..e728e70e7b 100644 --- a/c++/include/orc/Int128.hh +++ b/c++/include/orc/Int128.hh @@ -193,43 +193,13 @@ namespace orc { * Shift left by the given number of bits. * Values larger than 2**127 will shift into the sign bit. */ - Int128& operator<<=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - highbits_ <<= bits; - highbits_ |= (lowbits_ >> (64 - bits)); - lowbits_ <<= bits; - } else if (bits < 128) { - highbits_ = static_cast(lowbits_) << (bits - 64); - lowbits_ = 0; - } else { - highbits_ = 0; - lowbits_ = 0; - } - } - return *this; - } + Int128& operator<<=(uint32_t bits); /** * Shift right by the given number of bits. Negative values will * sign extend and fill with one bits. */ - Int128& operator>>=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - lowbits_ >>= bits; - lowbits_ |= static_cast(highbits_ << (64 - bits)); - highbits_ = static_cast(static_cast(highbits_) >> bits); - } else if (bits < 128) { - lowbits_ = static_cast(highbits_ >> (bits - 64)); - highbits_ = highbits_ >= 0 ? 0 : -1l; - } else { - highbits_ = highbits_ >= 0 ? 0 : -1l; - lowbits_ = static_cast(highbits_); - } - } - return *this; - } + Int128& operator>>=(uint32_t bits); bool operator==(const Int128& right) const { return highbits_ == right.highbits_ && lowbits_ == right.lowbits_; diff --git a/c++/src/Adaptor.hh.in b/c++/src/Adaptor.hh.in index 2cce8158e2..f3ed763eb3 100644 --- a/c++/src/Adaptor.hh.in +++ b/c++/src/Adaptor.hh.in @@ -49,6 +49,12 @@ typedef SSIZE_T ssize_t; ssize_t pread(int fd, void* buf, size_t count, off_t offset); #endif +#if defined(__GNUC__) || defined(__clang__) + #define NO_SANITIZE_ATTR __attribute__((no_sanitize("signed-integer-overflow", "shift"))) +#else + #define NO_SANITIZE_ATTR +#endif + #ifdef HAS_DIAGNOSTIC_PUSH #ifdef __clang__ #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc index 887637223a..025bdd8a03 100644 --- a/c++/src/BloomFilter.cc +++ b/c++/src/BloomFilter.cc @@ -208,7 +208,7 @@ namespace orc { } DIAGNOSTIC_POP - + NO_SANITIZE_ATTR void BloomFilterImpl::addHash(int64_t hash64) { int32_t hash1 = static_cast(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. @@ -226,6 +226,7 @@ namespace orc { } } + NO_SANITIZE_ATTR bool BloomFilterImpl::testHash(int64_t hash64) const { int32_t hash1 = static_cast(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh index ebc4a5ee04..d16abe933c 100644 --- a/c++/src/BloomFilter.hh +++ b/c++/src/BloomFilter.hh @@ -19,6 +19,7 @@ #ifndef ORC_BLOOMFILTER_IMPL_HH #define ORC_BLOOMFILTER_IMPL_HH +#include "Adaptor.hh" #include "orc/BloomFilter.hh" #include "wrap/orc-proto-wrapper.hh" @@ -194,6 +195,7 @@ namespace orc { // Thomas Wang's integer hash function // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm // Put this in header file so tests can use it as well. + NO_SANITIZE_ATTR inline int64_t getLongHash(int64_t key) { key = (~key) + (key << 21); // key = (key << 21) - key - 1; key = key ^ (key >> 24); diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index af434c37ca..0fd17de1b8 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -726,6 +726,9 @@ namespace orc { if (totalBytes <= lastBufferLength_) { // subtract the needed bytes from the ones left over lastBufferLength_ -= totalBytes; + if (lastBuffer_ == nullptr) { + throw ParseError("StringDirectColumnReader::skip: lastBuffer_ is null"); + } lastBuffer_ += totalBytes; } else { // move the stream forward after accounting for the buffered bytes @@ -780,7 +783,9 @@ namespace orc { byteBatch.blob.resize(totalLength); char* ptr = byteBatch.blob.data(); while (bytesBuffered + lastBufferLength_ < totalLength) { - memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_); + if (lastBuffer_ != nullptr) { + memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_); + } bytesBuffered += lastBufferLength_; const void* readBuffer; int readLength; diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc index a9003bc163..068af49acf 100644 --- a/c++/src/ConvertColumnReader.cc +++ b/c++/src/ConvertColumnReader.cc @@ -126,13 +126,13 @@ namespace orc { bool shouldThrow) { constexpr bool isFileTypeFloatingPoint(std::is_floating_point::value); constexpr bool isReadTypeFloatingPoint(std::is_floating_point::value); - int64_t longValue = static_cast(srcValue); + if (isFileTypeFloatingPoint) { if (isReadTypeFloatingPoint) { destValue = static_cast(srcValue); } else { if (!canFitInLong(static_cast(srcValue)) || - !downCastToInteger(destValue, longValue)) { + !downCastToInteger(destValue, static_cast(srcValue))) { handleOverflow(destBatch, idx, shouldThrow); } } diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc index 1e059fd4e2..0d4da78b5a 100644 --- a/c++/src/Int128.cc +++ b/c++/src/Int128.cc @@ -25,6 +25,41 @@ #include namespace orc { + NO_SANITIZE_ATTR + Int128& Int128::operator<<=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + highbits_ <<= bits; + highbits_ |= (lowbits_ >> (64 - bits)); + lowbits_ <<= bits; + } else if (bits < 128) { + highbits_ = static_cast(lowbits_) << (bits - 64); + lowbits_ = 0; + } else { + highbits_ = 0; + lowbits_ = 0; + } + } + return *this; + } + + NO_SANITIZE_ATTR + Int128& Int128::operator>>=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + lowbits_ >>= bits; + lowbits_ |= static_cast(highbits_ << (64 - bits)); + highbits_ = static_cast(static_cast(highbits_) >> bits); + } else if (bits < 128) { + lowbits_ = static_cast(highbits_ >> (bits - 64)); + highbits_ = highbits_ >= 0 ? 0 : -1l; + } else { + highbits_ = highbits_ >= 0 ? 0 : -1l; + lowbits_ = static_cast(highbits_); + } + } + return *this; + } Int128 Int128::maximumValue() { return Int128(0x7fffffffffffffff, 0xffffffffffffffff); diff --git a/c++/src/RLE.cc b/c++/src/RLE.cc index cb831c80f7..19ca558fc6 100644 --- a/c++/src/RLE.cc +++ b/c++/src/RLE.cc @@ -77,6 +77,7 @@ namespace orc { add(data, numValues, notNull); } + NO_SANITIZE_ATTR void RleEncoder::writeVslong(int64_t val) { writeVulong((val << 1) ^ (val >> 63)); } diff --git a/c++/src/RLE.hh b/c++/src/RLE.hh index e46504e885..b9905d16c8 100644 --- a/c++/src/RLE.hh +++ b/c++/src/RLE.hh @@ -19,6 +19,7 @@ #ifndef ORC_RLE_HH #define ORC_RLE_HH +#include "Adaptor.hh" #include "io/InputStream.hh" #include "io/OutputStream.hh" @@ -26,6 +27,7 @@ namespace orc { + NO_SANITIZE_ATTR inline int64_t zigZag(int64_t value) { return (value << 1) ^ (value >> 63); }