-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcutil.h
101 lines (83 loc) · 3.23 KB
/
cutil.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Cuda utilities
#pragma once
#ifndef __CUDACC__
#define __host__
#define __device__
#define cuda_check(...) ((void)0)
#define IF_CUDA(...)
#define CUDA_OR_DIE(...) die("No CUDA")
#include "debug.h"
#else // __CUDACC__
#include "arith.h"
#include "debug.h"
#include "device.h"
#include "noncopyable.h"
#include <cuda.h>
#include <driver_types.h>
#include <memory>
#include <type_traits>
namespace mandelbrot {
using std::add_const_t;
using std::conditional_t;
using std::is_const_v;
using std::is_signed_v;
using std::min;
using std::remove_const_t;
using std::shared_ptr;
using std::type_identity_t;
using std::unique_ptr;
#define IF_CUDA(...) __VA_ARGS__
#define CUDA_OR_DIE(...) __VA_ARGS__
void __attribute__((noreturn, cold))
cuda_check_fail(cudaError_t code, const char* function, const char* file, unsigned int line,
const char* expression, const string& message);
#define cuda_check(code, ...) ({ \
auto _code = (code); \
if (_code != cudaSuccess) \
cuda_check_fail(_code, __PRETTY_FUNCTION__, __FILE__, __LINE__, #code, format(__VA_ARGS__)); })
// For now, we share one stream for simplicity
CUstream stream();
void cuda_sync();
// Unpack a Device<T>* into a T* for use in kernel invocations
template<class T> static inline T* device_get(Device<T>* p) { return reinterpret_cast<T*>(p); }
template<class T> static inline const T* device_get(const Device<T>* p) { return reinterpret_cast<const T*>(p); }
template<class C> static inline auto device_get(C&& c) { return device_get(c.data()); }
// Host to device and back.
// We use synchronous copies since our high performance code will be entirely GPU resident.
// Copy a host span into a device span (sizes must match).
// The copy is synchronous: by design the high performance code stays
// entirely GPU resident, so host-to-device traffic is not on the hot path.
template<class T> static inline void host_to_device(span<Device<T>> dst, type_identity_t<span<const T>> src) {
  slow_assert(dst.size() == src.size());
  const auto bytes = src.size() * sizeof(T);
  cuda_check(cudaMemcpy(device_get(dst.data()), src.data(), bytes, cudaMemcpyHostToDevice));
}
// Copy a device span back to a host span (sizes must match).
// Synchronous by design — see host_to_device.
template<class T> static inline void device_to_host(span<T> dst, type_identity_t<span<const Device<T>>> src) {
  slow_assert(dst.size() == src.size());
  const auto bytes = src.size() * sizeof(T);
  cuda_check(cudaMemcpy(dst.data(), device_get(src.data()), bytes, cudaMemcpyDeviceToHost));
}
// One slow (synchronous) write of a single value from host to device.
// Takes src by const reference instead of by value: the original copied T
// into the parameter only to take its address for cudaMemcpy, which is
// wasteful for large T. The memcpy is synchronous, so the referenced
// object only needs to live for the duration of the call — no lifetime
// hazard is introduced, and all existing call sites compile unchanged.
template<class T> static inline void single_host_to_device(Device<T>* dst, const T& src) {
  cuda_check(cudaMemcpy(device_get(dst), &src, sizeof(T), cudaMemcpyHostToDevice));
}
// Copy device memory to device memory (sizes must match).
// Unlike the host transfers above, this uses cudaMemcpyAsync on the shared
// stream so device-side copies can overlap with other queued GPU work.
template<class T> static inline void device_to_device(span<Device<T>> dst, type_identity_t<span<const Device<T>>> src) {
  slow_assert(dst.size() == src.size());
  const auto bytes = src.size() * sizeof(T);
  cuda_check(cudaMemcpyAsync(device_get(dst.data()), device_get(src.data()), bytes,
                             cudaMemcpyDeviceToDevice, stream()));
}
// Number of SMs
int num_sms();
} // namespace mandelbrot
#endif // __CUDACC__
#include "debug.h"
#include "device.h"
#include "span.h"
#include <algorithm>
namespace mandelbrot {
using std::type_identity_t;
// Copy between two buffers that live in the same memory space:
// device spans dispatch to the async device_to_device copy (or die without
// CUDA), host spans fall back to a plain std::copy.
template<class T> static inline void same_to_same(span<T> dst, type_identity_t<span<const T>> src) {
  if constexpr (is_device<T>) {
    CUDA_OR_DIE(device_to_device(dst, src));
  } else {
    slow_assert(dst.size() == src.size());
    const auto count = src.size();
    std::copy(src.data(), src.data() + count, dst.data());
  }
}
} // namespace mandelbrot