Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix building for GPU with CUDA 12.3.1 #8853

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMake/ResolveDependency.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ function(set_with_default var_name envvar_name default)
set(${var_name}
$ENV{${envvar_name}}
PARENT_SCOPE)
elseif(DEFINED ${envvar_name})
set(${var_name}
${${envvar_name}}
PARENT_SCOPE)
assignUser marked this conversation as resolved.
Show resolved Hide resolved
else()
set(${var_name}
${default}
Expand Down
10 changes: 8 additions & 2 deletions CMake/resolve_dependency_modules/folly/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,19 @@ message(STATUS "Building Folly from source")
if(gflags_SOURCE STREQUAL "BUNDLED")
set(glog_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-gflags-glog.patch)
endif()
if(VELOX_ENABLE_GPU)
set(cudacc_patch && git apply ${CMAKE_CURRENT_LIST_DIR}/folly-cudacc.patch)
endif()

set(VELOX_FOLLY_PATCH_COMMAND
git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch ${glog_patch}
${cudacc_patch})

FetchContent_Declare(
folly
URL ${VELOX_FOLLY_SOURCE_URL}
URL_HASH ${VELOX_FOLLY_BUILD_SHA256_CHECKSUM}
PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/folly-no-export.patch
${glog_patch})
PATCH_COMMAND ${VELOX_FOLLY_PATCH_COMMAND})

if(ON_APPLE_M1)
# folly will wrongly assume x86_64 if this is not set
Expand Down
68 changes: 68 additions & 0 deletions CMake/resolve_dependency_modules/folly/folly-cudacc.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Rivos, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--- a/folly/Conv.h
+++ b/folly/Conv.h
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw you submitted a PR to upstream these changes to folly. What's the status of it? Is it likely to get merged?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have very little visibility whether it will get merged and when. I'll make sure to follow up in the coming days as well.

@@ -286,8 +286,8 @@ namespace detail {
template <class... T>
using LastElement = type_pack_element_t<sizeof...(T) - 1, T...>;

-#ifdef _MSC_VER
-// MSVC can't quite figure out the LastElementImpl::call() stuff
+#if defined(_MSC_VER) || defined(__CUDACC__)
+// MSVC and NVCC can't quite figure out the LastElementImpl::call() stuff
// in the base implementation, so we have to use tuples instead,
// which result in significantly more templates being compiled,
// though the runtime performance is the same.
--- a/folly/synchronization/RelaxedAtomic.h
+++ b/folly/synchronization/RelaxedAtomic.h
@@ -98,7 +98,7 @@ struct relaxed_atomic_base : protected std::atomic<T> {
};

template <typename T>
-struct relaxed_atomic_integral_base : private relaxed_atomic_base<T> {
+struct relaxed_atomic_integral_base : protected relaxed_atomic_base<T> {
private:
using atomic = std::atomic<T>;
using base = relaxed_atomic_base<T>;
@@ -108,7 +108,9 @@ struct relaxed_atomic_integral_base : private relaxed_atomic_base<T> {

using base::relaxed_atomic_base;
using base::operator=;
+#ifndef __CUDACC__
using base::operator T;
+#endif
using base::compare_exchange_strong;
using base::compare_exchange_weak;
using base::exchange;
@@ -206,7 +208,9 @@ struct relaxed_atomic : detail::relaxed_atomic_base<T> {

using base::relaxed_atomic_base;
using base::operator=;
+#ifndef __CUDACC__
using base::operator T;
+#endif
};

template <typename T>
@@ -220,7 +224,9 @@ struct relaxed_atomic<T*> : detail::relaxed_atomic_base<T*> {

using detail::relaxed_atomic_base<T*>::relaxed_atomic_base;
using base::operator=;
+#ifndef __CUDACC__
using base::operator T*;
+#endif

T* fetch_add(std::ptrdiff_t arg) noexcept {
return atomic::fetch_add(arg, std::memory_order_relaxed);
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,15 @@ message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")

if(${VELOX_ENABLE_GPU})
enable_language(CUDA)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "12.2")
message(FATAL_ERROR "CUDA version less than 12.2 not supported")
endif()
# Determine CUDA_ARCHITECTURES automatically.
cmake_policy(SET CMP0104 NEW)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# it will fail later in the build otherwise
message(FATAL_ERROR "-DCMAKE_CUDA_ARCHITECTURES= must be set")
endif()
if(CMAKE_BUILD_TYPE MATCHES Debug)
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
endif()
Expand Down
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ ifdef AZURESDK_ROOT_DIR
CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR)
endif

ifdef CUDA_ARCHITECTURES
CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)"
endif

ifdef CUDA_COMPILER
CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)"
endif

# Use Ninja if available. If Ninja is used, pass through parallelism control flags.
USE_NINJA ?= 1
ifeq ($(USE_NINJA), 1)
Expand Down Expand Up @@ -108,6 +116,16 @@ minimal: #: Minimal build
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON"
$(MAKE) build BUILD_DIR=release

# We specify -Dfolly_SOURCE=BUNDLED as we need to pick up the fix at CMake/resolve_dependency_modules/folly/folly-cudacc.patch
gpu: #: Build with GPU support
luhenry marked this conversation as resolved.
Show resolved Hide resolved
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED"
luhenry marked this conversation as resolved.
Show resolved Hide resolved
$(MAKE) build BUILD_DIR=release

# We specify -Dfolly_SOURCE=BUNDLED as we need to pick up the fix at CMake/resolve_dependency_modules/folly/folly-cudacc.patch
gpu_debug: #: Build with debugging symbols and GPU support
$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -Dfolly_SOURCE=BUNDLED"
$(MAKE) build BUILD_DIR=debug

dwio: #: Minimal build with dwio enabled.
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \
-DVELOX_BUILD_MINIMAL_WITH_DWIO=ON"
Expand Down
13 changes: 8 additions & 5 deletions scripts/setup-helper-functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ function get_cxx_flags {
CPU_ARCH="arm64"
fi

# On MacOs prevent the flood of translation visibility settings warnings.
ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
else [ "$OS" = "Linux" ];
# On MacOs prevent the flood of translation visibility settings warnings.
ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
elif [ "$OS" = "Linux" ]; then

local CPU_CAPABILITIES
CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}')
Expand Down Expand Up @@ -133,8 +133,11 @@ function get_cxx_flags {
"aarch64")
echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS"
;;
*)
echo -n "Architecture not supported!"

*)
echo "Architecture not supported: CPU_ARCH=$CPU_ARCH" 1>&2
exit 1
;;
esac

}
Expand Down
11 changes: 11 additions & 0 deletions velox/core/PlanNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -2404,3 +2404,14 @@ struct fmt::formatter<facebook::velox::core::JoinType> : formatter<int> {
return formatter<int>::format(static_cast<int>(s), ctx);
}
};

template <>
struct fmt::formatter<facebook::velox::core::AggregationNode::Step>
: formatter<std::string> {
auto format(
facebook::velox::core::AggregationNode::Step s,
format_context& ctx) {
return formatter<std::string>::format(
facebook::velox::core::mapAggregationStepToName(s), ctx);
}
};
2 changes: 0 additions & 2 deletions velox/experimental/gpu/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,3 @@

add_executable(velox_gpu_hash_table_test HashTableTest.cu)
target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
native)
4 changes: 3 additions & 1 deletion velox/experimental/gpu/tests/HashTableTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace {

constexpr int kBlockSize = 256;

__device__ uint32_t jenkinsRevMix32(uint32_t key) {
[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) {
key += (key << 12); // key *= (1 + (1 << 12))
key ^= (key >> 22);
key += (key << 4); // key *= (1 + (1 << 4))
Expand Down Expand Up @@ -299,6 +299,7 @@ __global__ void probe<true>(
cmpMask = 0xffffffff;
}
end:
;
}
}

Expand Down Expand Up @@ -628,6 +629,7 @@ __global__ void probePartitioned<true>(
cmpMask = 0xffffffff;
}
end:
;
}
}

Expand Down
2 changes: 0 additions & 2 deletions velox/experimental/wave/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
Type.cpp)

set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(velox_wave_common velox_exception velox_common_base
velox_type)

Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/Cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ struct EventImpl {
Event::Event(bool withTime) : hasTiming_(withTime) {
event_ = std::make_unique<EventImpl>();
CUDA_CHECK(
cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming));
cudaEventCreateWithFlags(&event_->event, withTime ? 0 : cudaEventDisableTiming));
}

Event::~Event() {}
Expand Down
29 changes: 29 additions & 0 deletions velox/experimental/wave/common/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,33 @@ PhysicalType fromCpuType(const Type& type) {
return ans;
}

std::string_view PhysicalType::kindString(Kind kind) {
switch (kind) {
case kInt8:
return "Int8";
case kInt16:
return "Int16";
case kInt32:
return "Int32";
case kInt64:
return "Int64";
case kInt128:
return "Int128";
case kFloat32:
return "Float32";
case kFloat64:
return "Float64";
case kString:
return "String";
case kArray:
return "Array";
case kMap:
return "Map";
case kRow:
return "Row";
}

VELOX_UNREACHABLE();
}

} // namespace facebook::velox::wave
18 changes: 18 additions & 0 deletions velox/experimental/wave/common/Type.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@

#pragma once

#include <fmt/format.h>
#include <cstdint>
#include <string>
#if FMT_VERSION >= 100100
#include <fmt/std.h>
#endif

namespace facebook::velox {
class Type;
Expand All @@ -40,8 +45,21 @@ struct PhysicalType {
} kind;
int32_t numChildren;
PhysicalType** children;

static std::string_view kindString(Kind kind);
};

PhysicalType fromCpuType(const Type&);

} // namespace facebook::velox::wave

template <>
struct fmt::formatter<facebook::velox::wave::PhysicalType::Kind>
: formatter<std::string_view> {
auto format(
facebook::velox::wave::PhysicalType::Kind s,
format_context& ctx) {
return formatter<std::string_view>::format(
facebook::velox::wave::PhysicalType::kindString(s), ctx);
}
};
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/tests/BlockTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ TEST_F(BlockTest, boolToIndices) {
std::vector<int32_t> referenceIndices(kNumFlags);
std::vector<int32_t> referenceSizes(kNumBlocks);
uint8_t* flags = flagsBuffer->as<uint8_t>();
for (auto i = 0; i < kNumFlags; ++i) {
for (size_t i = 0; i < kNumFlags; ++i) {
if ((i >> 8) % 17 == 0) {
flags[i] = 0;
} else if ((i >> 8) % 23 == 0) {
Expand Down
5 changes: 2 additions & 3 deletions velox/experimental/wave/common/tests/BlockTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ __global__ void boolToIndices(
int32_t** indices,
int32_t* sizes,
int64_t* times) {
extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[];
extern __shared__ char smem[];
int32_t idx = blockIdx.x;
// Start cycle timer
clock_t start = clock();
Expand Down Expand Up @@ -41,8 +41,7 @@ void BlockTestStream::testBoolToIndices(
}

__global__ void sum64(int64_t* numbers, int64_t* results) {
extern __shared__ __align__(
alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[];
extern __shared__ char smem[];
int32_t idx = blockIdx.x;
blockSum<256>(
[&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results);
Expand Down
3 changes: 0 additions & 3 deletions velox/experimental/wave/common/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
BlockTest.cpp BlockTest.cu)

set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
native)

add_test(velox_wave_common_test velox_wave_common_test)

target_link_libraries(
Expand Down
2 changes: 0 additions & 2 deletions velox/experimental/wave/exec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ add_library(
Wave.cpp
Project.cpp)

set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)
assignUser marked this conversation as resolved.
Show resolved Hide resolved

target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common
velox_exception velox_common_base velox_exec)

Expand Down
Loading
Loading