diff --git a/CMake/ResolveDependency.cmake b/CMake/ResolveDependency.cmake index eba371fbcd60..03b5ae386318 100644 --- a/CMake/ResolveDependency.cmake +++ b/CMake/ResolveDependency.cmake @@ -102,6 +102,10 @@ function(set_with_default var_name envvar_name default) set(${var_name} $ENV{${envvar_name}} PARENT_SCOPE) + elseif(DEFINED ${envvar_name}) + set(${var_name} + ${${envvar_name}} + PARENT_SCOPE) else() set(${var_name} ${default} diff --git a/CMake/resolve_dependency_modules/folly/CMakeLists.txt b/CMake/resolve_dependency_modules/folly/CMakeLists.txt index ffe09fb14b91..ce94132f672d 100644 --- a/CMake/resolve_dependency_modules/folly/CMakeLists.txt +++ b/CMake/resolve_dependency_modules/folly/CMakeLists.txt @@ -28,13 +28,19 @@ message(STATUS "Building Folly from source") if(gflags_SOURCE STREQUAL "BUNDLED") set(glog_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-gflags-glog.patch) endif() +if(VELOX_ENABLE_GPU) + set(cudacc_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-cudacc.patch) +endif() + +set(VELOX_FOLLY_PATCH_COMMAND + git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch ${glog_patch} + ${cudacc_patch}) FetchContent_Declare( folly URL ${VELOX_FOLLY_SOURCE_URL} URL_HASH ${VELOX_FOLLY_BUILD_SHA256_CHECKSUM} - PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch - ${glog_patch}) + PATCH_COMMAND ${VELOX_FOLLY_PATCH_COMMAND}) if(ON_APPLE_M1) # folly will wrongly assume x86_64 if this is not set diff --git a/CMake/resolve_dependency_modules/folly/folly-cudacc.patch b/CMake/resolve_dependency_modules/folly/folly-cudacc.patch new file mode 100644 index 000000000000..2a2079f87631 --- /dev/null +++ b/CMake/resolve_dependency_modules/folly/folly-cudacc.patch @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Rivos, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- a/folly/Conv.h ++++ b/folly/Conv.h +@@ -286,8 +286,8 @@ namespace detail { + template + using LastElement = type_pack_element_t; + +-#ifdef _MSC_VER +-// MSVC can't quite figure out the LastElementImpl::call() stuff ++#if defined(_MSC_VER) || defined(__CUDACC__) ++// MSVC and NVCC can't quite figure out the LastElementImpl::call() stuff + // in the base implementation, so we have to use tuples instead, + // which result in significantly more templates being compiled, + // though the runtime performance is the same. +--- a/folly/synchronization/RelaxedAtomic.h ++++ b/folly/synchronization/RelaxedAtomic.h +@@ -98,7 +98,7 @@ struct relaxed_atomic_base : protected std::atomic { + }; + + template +-struct relaxed_atomic_integral_base : private relaxed_atomic_base { ++struct relaxed_atomic_integral_base : protected relaxed_atomic_base { + private: + using atomic = std::atomic; + using base = relaxed_atomic_base; +@@ -108,7 +108,9 @@ struct relaxed_atomic_integral_base : private relaxed_atomic_base { + + using base::relaxed_atomic_base; + using base::operator=; ++#ifndef __CUDACC__ + using base::operator T; ++#endif + using base::compare_exchange_strong; + using base::compare_exchange_weak; + using base::exchange; +@@ -206,7 +208,9 @@ struct relaxed_atomic : detail::relaxed_atomic_base { + + using base::relaxed_atomic_base; + using base::operator=; ++#ifndef __CUDACC__ + using base::operator T; ++#endif + }; + + template +@@ -220,7 +224,9 @@ struct relaxed_atomic : detail::relaxed_atomic_base { + + using detail::relaxed_atomic_base::relaxed_atomic_base; + using base::operator=; ++#ifndef __CUDACC__ + using base::operator T*; ++#endif + + T* fetch_add(std::ptrdiff_t arg) noexcept { + return atomic::fetch_add(arg, std::memory_order_relaxed); diff --git a/CMakeLists.txt b/CMakeLists.txt index d3eb572c90a3..3930bcfc1e90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -371,8 +371,15 @@ message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}") if(${VELOX_ENABLE_GPU}) enable_language(CUDA) + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "12.2") + message(FATAL_ERROR "CUDA version less than 12.2 not supported") + endif() # Determine CUDA_ARCHITECTURES automatically. cmake_policy(SET CMP0104 NEW) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # it will fail later in the build otherwise + message(FATAL_ERROR "-DCMAKE_CUDA_ARCHITECTURES= must be set") + endif() if(CMAKE_BUILD_TYPE MATCHES Debug) add_compile_options("$<$:-G>") endif() diff --git a/Makefile b/Makefile index 82de59432436..227f79497740 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,14 @@ ifdef AZURESDK_ROOT_DIR CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR) endif +ifdef CUDA_ARCHITECTURES +CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)" +endif + +ifdef CUDA_COMPILER +CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)" +endif + # Use Ninja if available. If Ninja is used, pass through parallelism control flags. USE_NINJA ?= 1 ifeq ($(USE_NINJA), 1) @@ -108,6 +116,16 @@ minimal: #: Minimal build $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON" $(MAKE) build BUILD_DIR=release +# We specify -Dfolly_SOURCE=BUNDLED as we need to pick up the fix at CMake/resolve_dependency_modules/folly/folly-cudacc.patch +gpu: #: Build with GPU support + $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED" + $(MAKE) build BUILD_DIR=release + +# We specify -Dfolly_SOURCE=BUNDLED as we need to pick up the fix at CMake/resolve_dependency_modules/folly/folly-cudacc.patch +gpu_debug: #: Build with debugging symbols and GPU support + $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED" + $(MAKE) build BUILD_DIR=debug + dwio: #: Minimal build with dwio enabled. $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \ -DVELOX_BUILD_MINIMAL_WITH_DWIO=ON" diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh index 4f0a11e152fd..ce783cfff5dc 100644 --- a/scripts/setup-helper-functions.sh +++ b/scripts/setup-helper-functions.sh @@ -99,9 +99,9 @@ function get_cxx_flags { CPU_ARCH="arm64" fi - # On MacOs prevent the flood of translation visibility settings warnings. - ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden" - else [ "$OS" = "Linux" ]; + # On MacOs prevent the flood of translation visibility settings warnings. + ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden" + elif [ "$OS" = "Linux" ]; then local CPU_CAPABILITIES CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}') @@ -133,8 +133,11 @@ function get_cxx_flags { "aarch64") echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS" ;; - *) - echo -n "Architecture not supported!" + + *) + echo "Architecture not supported: CPU_ARCH=$CPU_ARCH" 1>&2 + exit 1 + ;; esac } diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 38d4f1f26fe9..e74922cca920 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -2404,3 +2404,14 @@ struct fmt::formatter : formatter { return formatter::format(static_cast(s), ctx); } }; + +template <> +struct fmt::formatter + : formatter { + auto format( + facebook::velox::core::AggregationNode::Step s, + format_context& ctx) { + return formatter::format( + facebook::velox::core::mapAggregationStepToName(s), ctx); + } +}; diff --git a/velox/experimental/gpu/tests/CMakeLists.txt b/velox/experimental/gpu/tests/CMakeLists.txt index 8eaca86e7b95..4dc31e422fbf 100644 --- a/velox/experimental/gpu/tests/CMakeLists.txt +++ b/velox/experimental/gpu/tests/CMakeLists.txt @@ -14,5 +14,3 @@ add_executable(velox_gpu_hash_table_test HashTableTest.cu) target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags) -set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES - native) diff --git a/velox/experimental/gpu/tests/HashTableTest.cu b/velox/experimental/gpu/tests/HashTableTest.cu index d1d5ac23ef25..79433295f6fe 100644 --- a/velox/experimental/gpu/tests/HashTableTest.cu +++ b/velox/experimental/gpu/tests/HashTableTest.cu @@ -35,7 +35,7 @@ namespace { constexpr int kBlockSize = 256; -__device__ uint32_t jenkinsRevMix32(uint32_t key) { +[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) { key += (key << 12); // key *= (1 + (1 << 12)) key ^= (key >> 22); key += (key << 4); // key *= (1 + (1 << 4)) @@ -299,6 +299,7 @@ __global__ void probe( cmpMask = 0xffffffff; } end: + ; } } @@ -628,6 +629,7 @@ __global__ void probePartitioned( cmpMask = 0xffffffff; } end: + ; } } diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt index 205d945331ae..a33fffffba35 100644 --- a/velox/experimental/wave/common/CMakeLists.txt +++ b/velox/experimental/wave/common/CMakeLists.txt @@ -15,8 +15,6 @@ add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp Type.cpp) -set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native) - target_link_libraries(velox_wave_common velox_exception velox_common_base velox_type) diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu index 48a1e6804029..a96c8d8d2662 100644 --- a/velox/experimental/wave/common/Cuda.cu +++ b/velox/experimental/wave/common/Cuda.cu @@ -171,7 +171,7 @@ struct EventImpl { Event::Event(bool withTime) : hasTiming_(withTime) { event_ = std::make_unique(); CUDA_CHECK( - cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming)); + cudaEventCreateWithFlags(&event_->event, withTime ? 0 : cudaEventDisableTiming)); } Event::~Event() {} diff --git a/velox/experimental/wave/common/Type.cpp b/velox/experimental/wave/common/Type.cpp index aa833e47a81c..e403cc8c5780 100644 --- a/velox/experimental/wave/common/Type.cpp +++ b/velox/experimental/wave/common/Type.cpp @@ -50,4 +50,33 @@ PhysicalType fromCpuType(const Type& type) { return ans; } +std::string_view PhysicalType::kindString(Kind kind) { + switch (kind) { + case kInt8: + return "Int8"; + case kInt16: + return "Int16"; + case kInt32: + return "Int32"; + case kInt64: + return "Int64"; + case kInt128: + return "Int128"; + case kFloat32: + return "Float32"; + case kFloat64: + return "Float64"; + case kString: + return "String"; + case kArray: + return "Array"; + case kMap: + return "Map"; + case kRow: + return "Row"; + } + + VELOX_UNREACHABLE(); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/Type.h b/velox/experimental/wave/common/Type.h index 597b153ac5cc..82a535f7cfcf 100644 --- a/velox/experimental/wave/common/Type.h +++ b/velox/experimental/wave/common/Type.h @@ -16,7 +16,12 @@ #pragma once +#include #include +#include +#if FMT_VERSION >= 100100 +#include +#endif namespace facebook::velox { class Type; @@ -40,8 +45,21 @@ struct PhysicalType { } kind; int32_t numChildren; PhysicalType** children; + + static std::string_view kindString(Kind kind); }; PhysicalType fromCpuType(const Type&); } // namespace facebook::velox::wave + +template <> +struct fmt::formatter + : formatter { + auto format( + facebook::velox::wave::PhysicalType::Kind s, + format_context& ctx) { + return formatter::format( + facebook::velox::wave::PhysicalType::kindString(s), ctx); + } +}; diff --git a/velox/experimental/wave/common/tests/BlockTest.cpp b/velox/experimental/wave/common/tests/BlockTest.cpp index 6c6d8b20f2f4..d1ac6e9ac4d1 100644 --- a/velox/experimental/wave/common/tests/BlockTest.cpp +++ b/velox/experimental/wave/common/tests/BlockTest.cpp @@ -59,7 +59,7 @@ TEST_F(BlockTest, boolToIndices) { std::vector referenceIndices(kNumFlags); std::vector referenceSizes(kNumBlocks); uint8_t* flags = flagsBuffer->as(); - for (auto i = 0; i < kNumFlags; ++i) { + for (size_t i = 0; i < kNumFlags; ++i) { if ((i >> 8) % 17 == 0) { flags[i] = 0; } else if ((i >> 8) % 23 == 0) { diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu index 830fa1358c0e..eccb461e3537 100644 --- a/velox/experimental/wave/common/tests/BlockTest.cu +++ b/velox/experimental/wave/common/tests/BlockTest.cu @@ -11,7 +11,7 @@ __global__ void boolToIndices( int32_t** indices, int32_t* sizes, int64_t* times) { - extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[]; + extern __shared__ char smem[]; int32_t idx = blockIdx.x; // Start cycle timer clock_t start = clock(); @@ -41,8 +41,7 @@ void BlockTestStream::testBoolToIndices( } __global__ void sum64(int64_t* numbers, int64_t* results) { - extern __shared__ __align__( - alignof(cub::BlockReduce::TempStorage)) char smem[]; + extern __shared__ char smem[]; int32_t idx = blockIdx.x; blockSum<256>( [&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results); diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt index 159261e72a80..f9d2a3305eec 100644 --- a/velox/experimental/wave/common/tests/CMakeLists.txt +++ b/velox/experimental/wave/common/tests/CMakeLists.txt @@ -15,9 +15,6 @@ add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu BlockTest.cpp BlockTest.cu) -set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES - native) - add_test(velox_wave_common_test velox_wave_common_test) target_link_libraries( diff --git a/velox/experimental/wave/exec/CMakeLists.txt b/velox/experimental/wave/exec/CMakeLists.txt index a346b917a4a0..74d216932b58 100644 --- a/velox/experimental/wave/exec/CMakeLists.txt +++ b/velox/experimental/wave/exec/CMakeLists.txt @@ -26,8 +26,6 @@ add_library( Wave.cpp Project.cpp) -set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native) - target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common velox_exception velox_common_base velox_exec) diff --git a/velox/experimental/wave/exec/ExprKernel.h b/velox/experimental/wave/exec/ExprKernel.h index ec4a2b274766..3611e7de7af7 100644 --- a/velox/experimental/wave/exec/ExprKernel.h +++ b/velox/experimental/wave/exec/ExprKernel.h @@ -17,6 +17,7 @@ #pragma once #include +#include "velox/common/base/Exceptions.h" #include "velox/experimental/wave/common/Cuda.h" #include "velox/experimental/wave/exec/ErrorCode.h" #include "velox/experimental/wave/vector/Operand.h" @@ -62,6 +63,43 @@ enum class OpCode { }; +inline std::ostream& operator<<(std::ostream& out, const OpCode& opcode) { + switch (opcode) { + case OpCode::kFilter: + return out << "Filter"; + case OpCode::kWrap: + return out << "Wrap"; + case OpCode::kPlus: + return out << "Plus"; + case OpCode::kMinus: + return out << "Minus"; + case OpCode::kTimes: + return out << "Times"; + case OpCode::kDivide: + return out << "Divide"; + case OpCode::kEquals: + return out << "Equals"; + case OpCode::kLT: + return out << "LT"; + case OpCode::kLTE: + return out << "LTE"; + case OpCode::kGT: + return out << "GT"; + case OpCode::kGTE: + return out << "GTE"; + case OpCode::kNE: + return out << "NE"; + } + + VELOX_UNREACHABLE(); +} + +inline std::string mapOpCodeToName(const OpCode& opcode) { + std::stringstream ss; + ss << opcode; + return ss.str(); +} + #define OP_MIX(op, t) \ static_cast(static_cast(t) + 8 * static_cast(op)) @@ -151,3 +189,11 @@ class WaveKernelStream : public Stream { }; } // namespace facebook::velox::wave + +template <> +struct fmt::formatter : formatter { + auto format(facebook::velox::wave::OpCode o, format_context& ctx) { + return formatter::format( + facebook::velox::wave::mapOpCodeToName(o), ctx); + } +}; diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt index 74feddf6cf06..c3fa76bc8ea8 100644 --- a/velox/experimental/wave/exec/tests/CMakeLists.txt +++ b/velox/experimental/wave/exec/tests/CMakeLists.txt @@ -14,8 +14,6 @@ add_executable(velox_wave_exec_test FilterProjectTest.cpp Main.cpp) -set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native) - add_test(velox_wave_exec_test velox_wave_exec_test) target_link_libraries( diff --git a/velox/experimental/wave/vector/tests/CMakeLists.txt b/velox/experimental/wave/vector/tests/CMakeLists.txt index 2d4201c4c4cc..5423006f52c8 100644 --- a/velox/experimental/wave/vector/tests/CMakeLists.txt +++ b/velox/experimental/wave/vector/tests/CMakeLists.txt @@ -14,9 +14,6 @@ add_executable(velox_wave_vector_test VectorTest.cpp) -set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES - native) - add_test(veloxwave__vector_test velox_wave_vector_test) target_link_libraries(