Skip to content

Commit

Permalink
Fix building for GPU with CUDA 12.3.1
Browse files Browse the repository at this point in the history
This change adds just enough support to build Velox with
VELOX_ENABLE_GPU=ON with the CUDA 12.3.1 toolchain
  • Loading branch information
luhenry committed Feb 26, 2024
1 parent 52ad7f5 commit c1a288c
Show file tree
Hide file tree
Showing 18 changed files with 213 additions and 26 deletions.
4 changes: 4 additions & 0 deletions CMake/ResolveDependency.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ function(set_with_default var_name envvar_name default)
set(${var_name}
$ENV{${envvar_name}}
PARENT_SCOPE)
elseif(DEFINED ${envvar_name})
set(${var_name}
${${envvar_name}}
PARENT_SCOPE)
else()
set(${var_name}
${default}
Expand Down
9 changes: 7 additions & 2 deletions CMake/resolve_dependency_modules/folly/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ message(STATUS "Building Folly from source")
if(gflags_SOURCE STREQUAL "BUNDLED")
set(glog_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-gflags-glog.patch)
endif()
if(VELOX_ENABLE_GPU)
set(cudacc_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-cudacc.patch)
endif()

set(VELOX_FOLLY_PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch
${glog_patch} ${cudacc_patch})

FetchContent_Declare(
folly
URL ${VELOX_FOLLY_SOURCE_URL}
URL_HASH ${VELOX_FOLLY_BUILD_SHA256_CHECKSUM}
PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch
${glog_patch})
PATCH_COMMAND ${VELOX_FOLLY_PATCH_COMMAND})

if(ON_APPLE_M1)
# folly will wrongly assume x86_64 if this is not set
Expand Down
68 changes: 68 additions & 0 deletions CMake/resolve_dependency_modules/folly/folly-cudacc.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Rivos, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--- a/folly/Conv.h
+++ b/folly/Conv.h
@@ -286,8 +286,8 @@ namespace detail {
template <class... T>
using LastElement = type_pack_element_t<sizeof...(T) - 1, T...>;

-#ifdef _MSC_VER
-// MSVC can't quite figure out the LastElementImpl::call() stuff
+#if defined(_MSC_VER) || defined(__CUDACC__)
+// MSVC and NVCC can't quite figure out the LastElementImpl::call() stuff
// in the base implementation, so we have to use tuples instead,
// which result in significantly more templates being compiled,
// though the runtime performance is the same.
--- a/folly/synchronization/RelaxedAtomic.h
+++ b/folly/synchronization/RelaxedAtomic.h
@@ -98,7 +98,7 @@ struct relaxed_atomic_base : protected std::atomic<T> {
};

template <typename T>
-struct relaxed_atomic_integral_base : private relaxed_atomic_base<T> {
+struct relaxed_atomic_integral_base : protected relaxed_atomic_base<T> {
private:
using atomic = std::atomic<T>;
using base = relaxed_atomic_base<T>;
@@ -108,7 +108,9 @@ struct relaxed_atomic_integral_base : private relaxed_atomic_base<T> {

using base::relaxed_atomic_base;
using base::operator=;
+#ifndef __CUDACC__
using base::operator T;
+#endif
using base::compare_exchange_strong;
using base::compare_exchange_weak;
using base::exchange;
@@ -206,7 +208,9 @@ struct relaxed_atomic : detail::relaxed_atomic_base<T> {

using base::relaxed_atomic_base;
using base::operator=;
+#ifndef __CUDACC__
using base::operator T;
+#endif
};

template <typename T>
@@ -220,7 +224,9 @@ struct relaxed_atomic<T*> : detail::relaxed_atomic_base<T*> {

using detail::relaxed_atomic_base<T*>::relaxed_atomic_base;
using base::operator=;
+#ifndef __CUDACC__
using base::operator T*;
+#endif

T* fetch_add(std::ptrdiff_t arg) noexcept {
return atomic::fetch_add(arg, std::memory_order_relaxed);
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ ifdef AZURESDK_ROOT_DIR
CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR)
endif

ifdef CUDA_ARCHITECTURES
CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)"
endif

ifdef CUDA_COMPILER
CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)"
endif

# Use Ninja if available. If Ninja is used, pass through parallelism control flags.
USE_NINJA ?= 1
ifeq ($(USE_NINJA), 1)
Expand Down Expand Up @@ -108,6 +116,14 @@ minimal: #: Minimal build
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON"
$(MAKE) build BUILD_DIR=release

gpu: #: Build with GPU support
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED"
$(MAKE) build BUILD_DIR=release

gpu_debug: #: Build with debugging symbols and GPU support
$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED"
$(MAKE) build BUILD_DIR=debug

dwio: #: Minimal build with dwio enabled.
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \
-DVELOX_BUILD_MINIMAL_WITH_DWIO=ON"
Expand Down
13 changes: 8 additions & 5 deletions scripts/setup-helper-functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ function get_cxx_flags {
CPU_ARCH="arm64"
fi

# On MacOs prevent the flood of translation visibility settings warnings.
ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
else [ "$OS" = "Linux" ];
# On MacOs prevent the flood of translation visibility settings warnings.
ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
elif [ "$OS" = "Linux" ]; then

local CPU_CAPABILITIES
CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}')
Expand Down Expand Up @@ -133,8 +133,11 @@ function get_cxx_flags {
"aarch64")
echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS"
;;
*)
echo -n "Architecture not supported!"

*)
echo "Architecture not supported: CPU_ARCH=$CPU_ARCH" 1>&2
exit 1
;;
esac

}
Expand Down
8 changes: 8 additions & 0 deletions velox/core/PlanNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -2404,3 +2404,11 @@ struct fmt::formatter<facebook::velox::core::JoinType> : formatter<int> {
return formatter<int>::format(static_cast<int>(s), ctx);
}
};

template <>
struct fmt::formatter<facebook::velox::core::AggregationNode::Step> : formatter<std::string> {
auto format(facebook::velox::core::AggregationNode::Step s, format_context& ctx) {
return formatter<std::string>::format(
facebook::velox::core::mapAggregationStepToName(s), ctx);
}
};
2 changes: 0 additions & 2 deletions velox/experimental/gpu/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,3 @@

add_executable(velox_gpu_hash_table_test HashTableTest.cu)
target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
native)
4 changes: 3 additions & 1 deletion velox/experimental/gpu/tests/HashTableTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace {

constexpr int kBlockSize = 256;

__device__ uint32_t jenkinsRevMix32(uint32_t key) {
[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) {
key += (key << 12); // key *= (1 + (1 << 12))
key ^= (key >> 22);
key += (key << 4); // key *= (1 + (1 << 4))
Expand Down Expand Up @@ -299,6 +299,7 @@ __global__ void probe<true>(
cmpMask = 0xffffffff;
}
end:
;
}
}

Expand Down Expand Up @@ -628,6 +629,7 @@ __global__ void probePartitioned<true>(
cmpMask = 0xffffffff;
}
end:
;
}
}

Expand Down
2 changes: 0 additions & 2 deletions velox/experimental/wave/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
Type.cpp)

set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(velox_wave_common velox_exception velox_common_base
velox_type)

Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/Cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ struct EventImpl {
Event::Event(bool withTime) : hasTiming_(withTime) {
event_ = std::make_unique<EventImpl>();
CUDA_CHECK(
cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming));
cudaEventCreateWithFlags(&event_->event, withTime ? 0 : cudaEventDisableTiming));
}

Event::~Event() {}
Expand Down
29 changes: 29 additions & 0 deletions velox/experimental/wave/common/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,33 @@ PhysicalType fromCpuType(const Type& type) {
return ans;
}

std::string PhysicalType::kindString(Kind kind) {
switch (kind) {
case kInt8:
return "Int8";
case kInt16:
return "Int16";
case kInt32:
return "Int32";
case kInt64:
return "Int64";
case kInt128:
return "Int128";
case kFloat32:
return "Float32";
case kFloat64:
return "Float64";
case kString:
return "String";
case kArray:
return "Array";
case kMap:
return "Map";
case kRow:
return "Row";
}

VELOX_UNREACHABLE();
}

} // namespace facebook::velox::wave
18 changes: 18 additions & 0 deletions velox/experimental/wave/common/Type.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
#pragma once

#include <cstdint>
#include <string>
#include <fmt/format.h>
#if FMT_VERSION >= 100100
#include <fmt/std.h>
#endif

namespace facebook::velox {
class Type;
Expand All @@ -40,8 +45,21 @@ struct PhysicalType {
} kind;
int32_t numChildren;
PhysicalType** children;

static std::string kindString(Kind kind);
};

PhysicalType fromCpuType(const Type&);

} // namespace facebook::velox::wave

template <>
struct fmt::formatter<facebook::velox::wave::PhysicalType::Kind>
: formatter<std::string> {
auto format(
facebook::velox::wave::PhysicalType::Kind s,
format_context& ctx) {
return formatter<std::string>::format(
facebook::velox::wave::PhysicalType::kindString(s), ctx);
}
};
6 changes: 3 additions & 3 deletions velox/experimental/wave/common/tests/BlockTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ __global__ void boolToIndices(
int32_t** indices,
int32_t* sizes,
int64_t* times) {
extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[];
__shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[1];
int32_t idx = blockIdx.x;
// Start cycle timer
clock_t start = clock();
Expand Down Expand Up @@ -41,8 +41,8 @@ void BlockTestStream::testBoolToIndices(
}

__global__ void sum64(int64_t* numbers, int64_t* results) {
extern __shared__ __align__(
alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[];
__shared__ __align__(
alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[1];
int32_t idx = blockIdx.x;
blockSum<256>(
[&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results);
Expand Down
3 changes: 0 additions & 3 deletions velox/experimental/wave/common/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
BlockTest.cpp BlockTest.cu)

set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
native)

add_test(velox_wave_common_test velox_wave_common_test)

target_link_libraries(
Expand Down
2 changes: 0 additions & 2 deletions velox/experimental/wave/exec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ add_library(
Wave.cpp
Project.cpp)

set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common
velox_exception velox_common_base velox_exec)

Expand Down
48 changes: 48 additions & 0 deletions velox/experimental/wave/exec/ExprKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <cstdint>
#include "velox/common/base/Exceptions.h"
#include "velox/experimental/wave/common/Cuda.h"
#include "velox/experimental/wave/exec/ErrorCode.h"
#include "velox/experimental/wave/vector/Operand.h"
Expand Down Expand Up @@ -62,6 +63,45 @@ enum class OpCode {

};

inline std::ostream& operator<<(
std::ostream& out,
const OpCode& opcode) {
switch (opcode) {
case OpCode::kFilter:
return out << "Filter";
case OpCode::kWrap:
return out << "Wrap";
case OpCode::kPlus:
return out << "Plus";
case OpCode::kMinus:
return out << "Minus";
case OpCode::kTimes:
return out << "Times";
case OpCode::kDivide:
return out << "Divide";
case OpCode::kEquals:
return out << "Equals";
case OpCode::kLT:
return out << "LT";
case OpCode::kLTE:
return out << "LTE";
case OpCode::kGT:
return out << "GT";
case OpCode::kGTE:
return out << "GTE";
case OpCode::kNE:
return out << "NE";
}

VELOX_UNREACHABLE();
}

inline std::string mapOpCodeToName(const OpCode& opcode) {
std::stringstream ss;
ss << opcode;
return ss.str();
}

#define OP_MIX(op, t) \
static_cast<OpCode>(static_cast<int32_t>(t) + 8 * static_cast<int32_t>(op))

Expand Down Expand Up @@ -151,3 +191,11 @@ class WaveKernelStream : public Stream {
};

} // namespace facebook::velox::wave

template <>
struct fmt::formatter<facebook::velox::wave::OpCode> : formatter<std::string> {
auto format(facebook::velox::wave::OpCode o, format_context& ctx) {
return formatter<std::string>::format(
facebook::velox::wave::mapOpCodeToName(o), ctx);
}
};
Loading

0 comments on commit c1a288c

Please sign in to comment.