forked from ROCm/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDispatchStub.h
158 lines (134 loc) · 4.71 KB
/
DispatchStub.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#pragma once
#include <ATen/ScalarType.h>
#include <c10/util/Exception.h>
#include <type_traits>
#include <utility>
// Implements instruction set specific function dispatch.
//
// Kernels that may make use of specialized instruction sets (e.g. AVX) are
// compiled multiple times with different compiler flags (e.g. -mavx). A
// DispatchStub contains a table of function pointers for a kernel. At runtime,
// the fastest available kernel is chosen based on the features reported by
// cpuinfo.
//
// Example:
//
// In native/MyKernel.h:
// using fn_type = void(*)(const Tensor& x);
// DECLARE_DISPATCH(fn_type, stub);
//
// In native/MyKernel.cpp:
// DEFINE_DISPATCH(stub);
//
// In native/cpu/MyKernel.cpp:
// namespace {
// // use anonymous namespace so that different cpu versions won't conflict
// void kernel(const Tensor& x) { ... }
// }
// REGISTER_DISPATCH(stub, &kernel);
//
// To call:
// stub(kCPU, tensor);
// Suppress clang's -Wundefined-var-template warnings: the static members
// DispatchStub::DEFAULT, AVX and AVX2 are declared here but defined elsewhere.
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-var-template"
#endif
namespace at { namespace native {
// Instruction-set tiers a CPU kernel can be registered for.
//
// The numeric values are significant: DispatchStub::choose_cpu_impl() casts
// them to int and compares with `>=`, so a larger value has to denote a tier
// that is preferred over every smaller one.
enum class CPUCapability {
  DEFAULT = 0,  // slot used when no specialized tier is selected
  AVX = 1,
  AVX2 = 2,
  NUM_OPTIONS   // sentinel; equals the number of tiers listed above
};

// Declared here, defined elsewhere. Its result is what choose_cpu_impl() uses
// to pick among the registered CPU kernels.
CPUCapability get_cpu_capability();
// Primary template. Only the function-pointer partial specialization below is
// defined. `T` is not used inside the class; it serves as a tag so that every
// stub is a distinct instantiation (DECLARE_DISPATCH passes the stub's own
// struct type).
template <typename FnPtr, typename T>
struct CAFFE2_API DispatchStub;

// Table of function pointers for one kernel with signature `rT (*)(Args...)`.
//
// CPU kernels live in the static members DEFAULT / AVX / AVX2 (the latter two
// exist only when the matching HAVE_*_CPU_DEFINITION macro is defined). The
// CUDA kernel, if any, is stored per object in `cuda_dispatch_ptr`.
template <typename rT, typename T, typename... Args>
struct CAFFE2_API DispatchStub<rT (*)(Args...), T> {
  using FnPtr = rT (*) (Args...);

  // Calls the kernel registered for `device_type`, perfectly forwarding
  // `args` to it, and returns the kernel's result.
  //
  //  * CPU:   on the first call the implementation is chosen by
  //           choose_cpu_impl() and cached in `cpu_dispatch_ptr`.
  //  * CUDA:  `cuda_dispatch_ptr` must already be set (AT_ASSERTM otherwise).
  //  * other: reported through AT_ERROR.
  template <typename... ArgTypes>
  rT operator()(DeviceType device_type, ArgTypes&&... args) {
    if (device_type == DeviceType::CPU) {
      // NOTE(review): this check-then-store on a plain pointer is not
      // synchronized; concurrent first calls would race on the write.
      // Confirm whether callers guarantee serialization.
      if (!cpu_dispatch_ptr) {
        cpu_dispatch_ptr = choose_cpu_impl();
      }
      return (*cpu_dispatch_ptr)(std::forward<ArgTypes>(args)...);
    } else if (device_type == DeviceType::CUDA) {
      AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel");
      return (*cuda_dispatch_ptr)(std::forward<ArgTypes>(args)...);
    } else {
      // The literal ends in ": " because the device type is appended right
      // after it (assumes AT_ERROR concatenates its arguments verbatim);
      // without the separator the message read "...device type<name>".
      AT_ERROR("DispatchStub: unsupported device type: ", device_type);
    }
  }

  // Returns the most specialized CPU kernel that is both compiled in and
  // permitted by get_cpu_capability(): AVX2 first, then AVX, then DEFAULT.
  // The selected slot must be non-null (AT_ASSERTM otherwise).
  FnPtr choose_cpu_impl() {
    auto capability = static_cast<int>(get_cpu_capability());
    // Keeps `capability` "used" when neither AVX macro below is defined.
    (void)capability;
#ifdef HAVE_AVX2_CPU_DEFINITION
    if (capability >= static_cast<int>(CPUCapability::AVX2)) {
      AT_ASSERTM(AVX2, "DispatchStub: missing AVX2 kernel");
      return AVX2;
    }
#endif
#ifdef HAVE_AVX_CPU_DEFINITION
    if (capability >= static_cast<int>(CPUCapability::AVX)) {
      AT_ASSERTM(AVX, "DispatchStub: missing AVX kernel");
      return AVX;
    }
#endif
    AT_ASSERTM(DEFAULT, "DispatchStub: missing default kernel");
    return DEFAULT;
  }

  // Cached result of choose_cpu_impl(); null until the first CPU call.
  FnPtr cpu_dispatch_ptr = nullptr;
  // CUDA kernel; written by RegisterDispatch's constructor.
  FnPtr cuda_dispatch_ptr = nullptr;

  // Per-tier CPU kernels. Only declared here; the definitions are supplied as
  // explicit specializations by REGISTER_ARCH_DISPATCH.
  static FnPtr DEFAULT;
#ifdef HAVE_AVX_CPU_DEFINITION
  static FnPtr AVX;
#endif
#ifdef HAVE_AVX2_CPU_DEFINITION
  static FnPtr AVX2;
#endif
};
namespace {

// Registration helper: constructing an instance stores `value` in the
// `cuda_dispatch_ptr` slot of `stub`. REGISTER_CUDA_DISPATCH creates a static
// instance of this type to perform the registration; the object itself holds
// no state.
template <typename FnPtr, typename T>
struct RegisterDispatch {
  RegisterDispatch(DispatchStub<FnPtr, T>& stub, FnPtr value) {
    stub.cuda_dispatch_ptr = value;
  }
};

} // anonymous namespace
// DECLARE_DISPATCH(fn, name)
//
// Declares a stub type `name` deriving from DispatchStub<fn, name> and an
// `extern` object of that type carrying the same identifier. `fn` is the
// kernel's function-pointer type. The expansion does not end in a semicolon;
// the caller supplies it.
//
// Compiler will complain if you put things like std::tuple<Tensor, Tensor> in
// the `fn` argument of DECLARE_DISPATCH (the comma inside the template
// argument list is taken as a macro-argument separator). Some possible
// workarounds, e.g., adding parentheses and using helper struct to get rid of
// the parentheses, do not work with MSVC. So do a `using`-declaration if you
// need to pass in such `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in
// GridSampleKernel.h.
#define DECLARE_DISPATCH(fn, name) \
struct name : DispatchStub<fn, name> {}; \
extern CAFFE2_API struct name name
// DEFINE_DISPATCH(name)
//
// Defines the stub object that DECLARE_DISPATCH(fn, name) declared `extern`.
// The expansion does not end in a semicolon; the caller supplies it.
#define DEFINE_DISPATCH(name) struct name name
// REGISTER_ARCH_DISPATCH(name, arch, fn)
//
// Supplies the definition of the static member `arch` (one of the DEFAULT /
// AVX / AVX2 members declared in DispatchStub) for stub `name`, as an explicit
// specialization initialized to `fn`. The expansion already ends in `;`.
//
// NOTE(review): the specialization is keyed on decltype(fn), so `fn`
// presumably has to have exactly the function-pointer type that was given to
// DECLARE_DISPATCH for `name` — confirm against callers.
#define REGISTER_ARCH_DISPATCH(name, arch, fn) \
template <> decltype(fn) DispatchStub<decltype(fn), struct name>::arch = fn;
// REGISTER_AVX_DISPATCH(name, fn) / REGISTER_AVX2_DISPATCH(name, fn)
//
// Register `fn` in the AVX / AVX2 slot of stub `name` when the matching
// HAVE_AVX_CPU_DEFINITION / HAVE_AVX2_CPU_DEFINITION macro is defined.
// Otherwise they expand to nothing; in that configuration DispatchStub does
// not declare the corresponding static member either.
#ifdef HAVE_AVX_CPU_DEFINITION
#define REGISTER_AVX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX, fn)
#else
#define REGISTER_AVX_DISPATCH(name, fn)
#endif
#ifdef HAVE_AVX2_CPU_DEFINITION
#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn)
#else
#define REGISTER_AVX2_DISPATCH(name, fn)
#endif
// REGISTER_NO_CPU_DISPATCH(name, fn_type)
//
// Fills every available CPU slot of stub `name` (DEFAULT, plus AVX / AVX2 when
// those are compiled in) with a null pointer of type `fn_type`. With all slots
// null, choose_cpu_impl() trips its "missing ... kernel" assertion if the stub
// is ever invoked for the CPU device.
#define REGISTER_NO_CPU_DISPATCH(name, fn_type) \
REGISTER_ARCH_DISPATCH(name, DEFAULT, static_cast<fn_type>(nullptr)) \
REGISTER_AVX_DISPATCH(name, static_cast<fn_type>(nullptr)) \
REGISTER_AVX2_DISPATCH(name, static_cast<fn_type>(nullptr))
// REGISTER_CUDA_DISPATCH(name, fn)
//
// Defines a `static` RegisterDispatch object named `<name>__register`; its
// constructor stores `fn` in `name.cuda_dispatch_ptr`. The expansion already
// ends in `;`.
#define REGISTER_CUDA_DISPATCH(name, fn) \
static RegisterDispatch<decltype(fn), struct name> name ## __register(name, fn);
// REGISTER_DISPATCH(name, fn)
//
// Registration entry point whose meaning depends on how the including source
// file is compiled:
//  * __CUDACC__ defined:     registers `fn` as the CUDA kernel of `name`.
//  * CPU_CAPABILITY defined: registers `fn` in the CPU slot named by
//                            CPU_CAPABILITY.
//  * neither:                the macro is left undefined.
//
// NOTE(review): CPU_CAPABILITY is presumably set by the build system to
// DEFAULT, AVX or AVX2 for each compilation of a CPU kernel file — confirm.
#if defined(__CUDACC__)
#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
#elif defined(CPU_CAPABILITY)
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
}} // namespace at::native
#if defined(__clang__)
#pragma clang diagnostic pop
#endif