Fix building for GPU with CUDA 12.3.1

This change adds just enough support to build Velox with VELOX_ENABLE_GPU=ON with the CUDA 12.3.1 toolchain
facebookincubator · Feb 26, 2024 · c1a288c · c1a288c
1 parent 52ad7f5
commit c1a288c
Show file tree

Hide file tree

Showing 18 changed files with 213 additions and 26 deletions.
diff --git a/CMake/ResolveDependency.cmake b/CMake/ResolveDependency.cmake
@@ -102,6 +102,10 @@ function(set_with_default var_name envvar_name default)
     set(${var_name}
         $ENV{${envvar_name}}
         PARENT_SCOPE)
+  elseif(DEFINED ${envvar_name})
+    set(${var_name}
+        ${${envvar_name}}
+        PARENT_SCOPE)
   else()
     set(${var_name}
         ${default}

diff --git a/CMake/resolve_dependency_modules/folly/CMakeLists.txt b/CMake/resolve_dependency_modules/folly/CMakeLists.txt
@@ -28,13 +28,18 @@ message(STATUS "Building Folly from source")
 if(gflags_SOURCE STREQUAL "BUNDLED")
   set(glog_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-gflags-glog.patch)
 endif()
+if(VELOX_ENABLE_GPU)
+  set(cudacc_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-cudacc.patch)
+endif()
+
+set(VELOX_FOLLY_PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch
+                                ${glog_patch} ${cudacc_patch})
 
 FetchContent_Declare(
   folly
   URL ${VELOX_FOLLY_SOURCE_URL}
   URL_HASH ${VELOX_FOLLY_BUILD_SHA256_CHECKSUM}
-  PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch
-                ${glog_patch})
+  PATCH_COMMAND ${VELOX_FOLLY_PATCH_COMMAND})
 
 if(ON_APPLE_M1)
   # folly will wrongly assume x86_64 if this is not set

diff --git a/CMake/resolve_dependency_modules/folly/folly-cudacc.patch b/CMake/resolve_dependency_modules/folly/folly-cudacc.patch
@@ -0,0 +1,68 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) Rivos, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+--- a/folly/Conv.h
++++ b/folly/Conv.h
+@@ -286,8 +286,8 @@ namespace detail {
+ template <class... T>
+ using LastElement = type_pack_element_t<sizeof...(T) - 1, T...>;
+
+-#ifdef _MSC_VER
+-// MSVC can't quite figure out the LastElementImpl::call() stuff
++#if defined(_MSC_VER) || defined(__CUDACC__)
++// MSVC and NVCC can't quite figure out the LastElementImpl::call() stuff
+ // in the base implementation, so we have to use tuples instead,
+ // which result in significantly more templates being compiled,
+ // though the runtime performance is the same.
+--- a/folly/synchronization/RelaxedAtomic.h
++++ b/folly/synchronization/RelaxedAtomic.h
+@@ -98,7 +98,7 @@ struct relaxed_atomic_base : protected std::atomic<T> {
+ };
+
+ template <typename T>
+-struct relaxed_atomic_integral_base : private relaxed_atomic_base<T> {
++struct relaxed_atomic_integral_base : protected relaxed_atomic_base<T> {
+  private:
+   using atomic = std::atomic<T>;
+   using base = relaxed_atomic_base<T>;
+@@ -108,7 +108,9 @@ struct relaxed_atomic_integral_base : private relaxed_atomic_base<T> {
+
+   using base::relaxed_atomic_base;
+   using base::operator=;
++#ifndef __CUDACC__
+   using base::operator T;
++#endif
+   using base::compare_exchange_strong;
+   using base::compare_exchange_weak;
+   using base::exchange;
+@@ -206,7 +208,9 @@ struct relaxed_atomic : detail::relaxed_atomic_base<T> {
+
+   using base::relaxed_atomic_base;
+   using base::operator=;
++#ifndef __CUDACC__
+   using base::operator T;
++#endif
+ };
+
+ template <typename T>
+@@ -220,7 +224,9 @@ struct relaxed_atomic<T*> : detail::relaxed_atomic_base<T*> {
+
+   using detail::relaxed_atomic_base<T*>::relaxed_atomic_base;
+   using base::operator=;
++#ifndef __CUDACC__
+   using base::operator T*;
++#endif
+
+   T* fetch_add(std::ptrdiff_t arg) noexcept {
+     return atomic::fetch_add(arg, std::memory_order_relaxed);
diff --git a/Makefile b/Makefile
@@ -50,6 +50,14 @@ ifdef AZURESDK_ROOT_DIR
 CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR)
 endif
 
+ifdef CUDA_ARCHITECTURES
+CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)"
+endif
+
+ifdef CUDA_COMPILER
+CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)"
+endif
+
 # Use Ninja if available. If Ninja is used, pass through parallelism control flags.
 USE_NINJA ?= 1
 ifeq ($(USE_NINJA), 1)
@@ -108,6 +116,14 @@ minimal:				 #: Minimal build
 	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON"
 	$(MAKE) build BUILD_DIR=release
 
+gpu:						 #: Build with GPU support
+	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED"
+	$(MAKE) build BUILD_DIR=release
+
+gpu_debug:			 #: Build with debugging symbols and GPU support
+	$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED"
+	$(MAKE) build BUILD_DIR=debug
+
 dwio:						#: Minimal build with dwio enabled.
 	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \
 																										    							  -DVELOX_BUILD_MINIMAL_WITH_DWIO=ON"

diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh
@@ -99,9 +99,9 @@ function get_cxx_flags {
         CPU_ARCH="arm64"
       fi
 
-    # On MacOs prevent the flood of translation visibility settings warnings.
-    ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
-    else [ "$OS" = "Linux" ];
+      # On MacOs prevent the flood of translation visibility settings warnings.
+      ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
+    elif [ "$OS" = "Linux" ]; then
 
       local CPU_CAPABILITIES
       CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}')
@@ -133,8 +133,11 @@ function get_cxx_flags {
     "aarch64")
       echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS"
     ;;
-  *)
-    echo -n "Architecture not supported!"
+
+    *)
+      echo "Architecture not supported: CPU_ARCH=$CPU_ARCH" 1>&2
+      exit 1
+    ;;
   esac
 
 }

diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h
@@ -2404,3 +2404,11 @@ struct fmt::formatter<facebook::velox::core::JoinType> : formatter<int> {
     return formatter<int>::format(static_cast<int>(s), ctx);
   }
 };
+
+template <>
+struct fmt::formatter<facebook::velox::core::AggregationNode::Step> : formatter<std::string> {
+  auto format(facebook::velox::core::AggregationNode::Step s, format_context& ctx) {
+    return formatter<std::string>::format(
+        facebook::velox::core::mapAggregationStepToName(s), ctx);
+  }
+};
diff --git a/velox/experimental/gpu/tests/CMakeLists.txt b/velox/experimental/gpu/tests/CMakeLists.txt
@@ -14,5 +14,3 @@
 
 add_executable(velox_gpu_hash_table_test HashTableTest.cu)
 target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
-set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
-                                                           native)
diff --git a/velox/experimental/gpu/tests/HashTableTest.cu b/velox/experimental/gpu/tests/HashTableTest.cu
@@ -35,7 +35,7 @@ namespace {
 
 constexpr int kBlockSize = 256;
 
-__device__ uint32_t jenkinsRevMix32(uint32_t key) {
+[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) {
   key += (key << 12); // key *= (1 + (1 << 12))
   key ^= (key >> 22);
   key += (key << 4); // key *= (1 + (1 << 4))
@@ -299,6 +299,7 @@ __global__ void probe<true>(
       cmpMask = 0xffffffff;
     }
   end:
+    ;
   }
 }
 
@@ -628,6 +629,7 @@ __global__ void probePartitioned<true>(
       cmpMask = 0xffffffff;
     }
   end:
+    ;
   }
 }
 

diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt
@@ -15,8 +15,6 @@
 add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
                               Type.cpp)
 
-set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)
-
 target_link_libraries(velox_wave_common velox_exception velox_common_base
                       velox_type)
 

diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu
@@ -171,7 +171,7 @@ struct EventImpl {
 Event::Event(bool withTime) : hasTiming_(withTime) {
   event_ = std::make_unique<EventImpl>();
   CUDA_CHECK(
-      cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming));
+      cudaEventCreateWithFlags(&event_->event, withTime ? 0 : cudaEventDisableTiming));
 }
 
 Event::~Event() {}

diff --git a/velox/experimental/wave/common/Type.cpp b/velox/experimental/wave/common/Type.cpp
@@ -50,4 +50,33 @@ PhysicalType fromCpuType(const Type& type) {
   return ans;
 }
 
+std::string PhysicalType::kindString(Kind kind) {
+  switch (kind) {
+    case kInt8:
+      return "Int8";
+    case kInt16:
+      return "Int16";
+    case kInt32:
+      return "Int32";
+    case kInt64:
+      return "Int64";
+    case kInt128:
+      return "Int128";
+    case kFloat32:
+      return "Float32";
+    case kFloat64:
+      return "Float64";
+    case kString:
+      return "String";
+    case kArray:
+      return "Array";
+    case kMap:
+      return "Map";
+    case kRow:
+      return "Row";
+  }
+
+  VELOX_UNREACHABLE();
+}
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Type.h b/velox/experimental/wave/common/Type.h
@@ -17,6 +17,11 @@
 #pragma once
 
 #include <cstdint>
+#include <string>
+#include <fmt/format.h>
+#if FMT_VERSION >= 100100
+#include <fmt/std.h>
+#endif
 
 namespace facebook::velox {
 class Type;
@@ -40,8 +45,21 @@ struct PhysicalType {
   } kind;
   int32_t numChildren;
   PhysicalType** children;
+
+  static std::string kindString(Kind kind);
 };
 
 PhysicalType fromCpuType(const Type&);
 
 } // namespace facebook::velox::wave
+
+template <>
+struct fmt::formatter<facebook::velox::wave::PhysicalType::Kind>
+    : formatter<std::string> {
+  auto format(
+      facebook::velox::wave::PhysicalType::Kind s,
+      format_context& ctx) {
+    return formatter<std::string>::format(
+        facebook::velox::wave::PhysicalType::kindString(s), ctx);
+  }
+};
diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu
@@ -11,7 +11,7 @@ __global__ void boolToIndices(
     int32_t** indices,
     int32_t* sizes,
     int64_t* times) {
-  extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[];
+  __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[1];
   int32_t idx = blockIdx.x;
   // Start cycle timer
   clock_t start = clock();
@@ -41,8 +41,8 @@ void BlockTestStream::testBoolToIndices(
 }
 
 __global__ void sum64(int64_t* numbers, int64_t* results) {
-  extern __shared__ __align__(
-      alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[];
+  __shared__ __align__(
+      alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[1];
   int32_t idx = blockIdx.x;
   blockSum<256>(
       [&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results);

diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt
@@ -15,9 +15,6 @@
 add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
                                       BlockTest.cpp BlockTest.cu)
 
-set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
-                                                        native)
-
 add_test(velox_wave_common_test velox_wave_common_test)
 
 target_link_libraries(

diff --git a/velox/experimental/wave/exec/CMakeLists.txt b/velox/experimental/wave/exec/CMakeLists.txt
@@ -26,8 +26,6 @@ add_library(
   Wave.cpp
   Project.cpp)
 
-set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)
-
 target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common
                       velox_exception velox_common_base velox_exec)
 

diff --git a/velox/experimental/wave/exec/ExprKernel.h b/velox/experimental/wave/exec/ExprKernel.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cstdint>
+#include "velox/common/base/Exceptions.h"
 #include "velox/experimental/wave/common/Cuda.h"
 #include "velox/experimental/wave/exec/ErrorCode.h"
 #include "velox/experimental/wave/vector/Operand.h"
@@ -62,6 +63,45 @@ enum class OpCode {
 
 };
 
+inline std::ostream& operator<<(
+    std::ostream& out,
+    const OpCode& opcode) {
+  switch (opcode) {
+    case OpCode::kFilter:
+      return out << "Filter";
+    case OpCode::kWrap:
+      return out << "Wrap";
+    case OpCode::kPlus:
+      return out << "Plus";
+    case OpCode::kMinus:
+      return out << "Minus";
+    case OpCode::kTimes:
+      return out << "Times";
+    case OpCode::kDivide:
+      return out << "Divide";
+    case OpCode::kEquals:
+      return out << "Equals";
+    case OpCode::kLT:
+      return out << "LT";
+    case OpCode::kLTE:
+      return out << "LTE";
+    case OpCode::kGT:
+      return out << "GT";
+    case OpCode::kGTE:
+      return out << "GTE";
+    case OpCode::kNE:
+      return out << "NE";
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+inline std::string mapOpCodeToName(const OpCode& opcode) {
+  std::stringstream ss;
+  ss << opcode;
+  return ss.str();
+}
+
 #define OP_MIX(op, t) \
   static_cast<OpCode>(static_cast<int32_t>(t) + 8 * static_cast<int32_t>(op))
 
@@ -151,3 +191,11 @@ class WaveKernelStream : public Stream {
 };
 
 } // namespace facebook::velox::wave
+
+template <>
+struct fmt::formatter<facebook::velox::wave::OpCode> : formatter<std::string> {
+  auto format(facebook::velox::wave::OpCode o, format_context& ctx) {
+    return formatter<std::string>::format(
+        facebook::velox::wave::mapOpCodeToName(o), ctx);
+  }
+};