forked from ROCm/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
common_gpu.cc
343 lines (317 loc) · 10.9 KB
/
common_gpu.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#include "caffe2/core/common_gpu.h"
#include <atomic>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include "caffe2/core/asan.h"
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
C10_DEFINE_bool(
caffe2_cuda_full_device_control,
false,
"If true, assume all the cudaSetDevice and cudaGetDevice calls will be "
"controlled by Caffe2, and non-Caffe2 code will ensure that the entry and "
"exit point has the same cuda device. Under the hood, Caffe2 will use "
"thread local variables to cache the device, in order to speed up set and "
"get device calls. This is an experimental feature that may have non "
"trivial side effects, so use it with care and only enable it if you are "
"absolutely sure. Also, this flag should not be changed after the program "
"initializes.");
namespace caffe2 {
int NumCudaDevices() {
if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
static bool first = true;
if (first) {
first = false;
std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
<< std::endl;
}
}
static int count = -1;
if (count < 0) {
auto err = cudaGetDeviceCount(&count);
switch (err) {
case cudaSuccess:
// Everything is good.
break;
case cudaErrorNoDevice:
count = 0;
break;
case cudaErrorInsufficientDriver:
LOG(WARNING) << "Insufficient cuda driver. Cannot use cuda.";
count = 0;
break;
case cudaErrorInitializationError:
LOG(WARNING) << "Cuda driver initialization failed, you might not "
"have a cuda gpu.";
count = 0;
break;
case cudaErrorUnknown:
LOG(ERROR) << "Found an unknown error - this may be due to an "
"incorrectly set up environment, e.g. changing env "
"variable CUDA_VISIBLE_DEVICES after program start. "
"I will set the available devices to be zero.";
count = 0;
break;
case cudaErrorMemoryAllocation:
#if CAFFE2_ASAN_ENABLED
// In ASAN mode, we know that a cudaErrorMemoryAllocation error will
// pop up.
LOG(ERROR) << "It is known that CUDA does not work well with ASAN. As "
"a result we will simply shut down CUDA support. If you "
"would like to use GPUs, turn off ASAN.";
count = 0;
break;
#else // CAFFE2_ASAN_ENABLED
// If we are not in ASAN mode and we get cudaErrorMemoryAllocation,
// this means that something is wrong before NumCudaDevices() call.
LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
"some cuda functions before calling NumCudaDevices() "
"that might have already set an error? Error: "
<< err;
break;
#endif // CAFFE2_ASAN_ENABLED
default:
LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
"some cuda functions before calling NumCudaDevices() "
"that might have already set an error? Error: "
<< err;
}
}
return count;
}
namespace {
int gDefaultGPUID = 0;
// Only used when FLAGS_caffe2_cuda_full_device_control is set true.
thread_local int gCurrentDevice = -1;
} // namespace
void SetDefaultGPUID(const int deviceid) {
CAFFE_ENFORCE_LT(
deviceid,
NumCudaDevices(),
"The default gpu id should be smaller than the number of gpus "
"on this machine: ",
deviceid,
" vs ",
NumCudaDevices());
gDefaultGPUID = deviceid;
}
int GetDefaultGPUID() { return gDefaultGPUID; }
int CaffeCudaGetDevice() {
if (FLAGS_caffe2_cuda_full_device_control) {
if (gCurrentDevice < 0) {
CUDA_ENFORCE(cudaGetDevice(&gCurrentDevice));
}
return gCurrentDevice;
} else {
int gpu_id = 0;
CUDA_ENFORCE(cudaGetDevice(&gpu_id));
return gpu_id;
}
}
void CaffeCudaSetDevice(const int id) {
if (FLAGS_caffe2_cuda_full_device_control) {
if (gCurrentDevice != id) {
CUDA_ENFORCE(cudaSetDevice(id));
}
gCurrentDevice = id;
} else {
CUDA_ENFORCE(cudaSetDevice(id));
}
}
int GetGPUIDForPointer(const void* ptr) {
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
if (err == cudaErrorInvalidValue) {
// Occurs when the pointer is in the CPU address space that is
// unmanaged by CUDA; make sure the last error state is cleared,
// since it is persistent
err = cudaGetLastError();
CHECK(err == cudaErrorInvalidValue);
return -1;
}
// Otherwise, there must be no error
CUDA_ENFORCE(err);
if (attr.CAFFE2_CUDA_PTRATTR_MEMTYPE == cudaMemoryTypeHost) {
return -1;
}
return attr.device;
}
struct CudaDevicePropWrapper {
CudaDevicePropWrapper() : props(NumCudaDevices()) {
for (int i = 0; i < NumCudaDevices(); ++i) {
CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
}
}
vector<cudaDeviceProp> props;
};
const cudaDeviceProp& GetDeviceProperty(const int deviceid) {
// According to C++11 standard section 6.7, static local variable init is
// thread safe. See
// https://stackoverflow.com/questions/8102125/is-local-static-variable-initialization-thread-safe-in-c11
// for details.
static CudaDevicePropWrapper props;
CAFFE_ENFORCE_LT(
deviceid,
NumCudaDevices(),
"The gpu id should be smaller than the number of gpus ",
"on this machine: ",
deviceid,
" vs ",
NumCudaDevices());
return props.props[deviceid];
}
void DeviceQuery(const int device) {
const cudaDeviceProp& prop = GetDeviceProperty(device);
std::stringstream ss;
ss << std::endl;
ss << "Device id: " << device << std::endl;
ss << "Major revision number: " << prop.major << std::endl;
ss << "Minor revision number: " << prop.minor << std::endl;
ss << "Name: " << prop.name << std::endl;
ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
ss << "Total shared memory per block: " << prop.sharedMemPerBlock
<< std::endl;
ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
ss << "Warp size: " << prop.warpSize << std::endl;
#ifndef __HIPCC__
ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
#endif
ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
<< std::endl;
ss << "Maximum dimension of block: "
<< prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
<< prop.maxThreadsDim[2] << std::endl;
ss << "Maximum dimension of grid: "
<< prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
<< prop.maxGridSize[2] << std::endl;
ss << "Clock rate: " << prop.clockRate << std::endl;
ss << "Total constant memory: " << prop.totalConstMem << std::endl;
#ifndef __HIPCC__
ss << "Texture alignment: " << prop.textureAlignment << std::endl;
ss << "Concurrent copy and execution: "
<< (prop.deviceOverlap ? "Yes" : "No") << std::endl;
#endif
ss << "Number of multiprocessors: " << prop.multiProcessorCount
<< std::endl;
#ifndef __HIPCC__
ss << "Kernel execution timeout: "
<< (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
#endif
LOG(INFO) << ss.str();
return;
}
bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
int gpu_count;
if (cudaGetDeviceCount(&gpu_count) != cudaSuccess) return false;
pattern->clear();
pattern->resize(gpu_count, vector<bool>(gpu_count, false));
for (int i = 0; i < gpu_count; ++i) {
for (int j = 0; j < gpu_count; ++j) {
int can_access = true;
if (i != j) {
if (cudaDeviceCanAccessPeer(&can_access, i, j)
!= cudaSuccess) {
return false;
}
}
(*pattern)[i][j] = static_cast<bool>(can_access);
}
}
return true;
}
bool TensorCoreAvailable() {
// requires CUDA 9.0 and above
#if CUDA_VERSION < 9000
return false;
#else
int device = CaffeCudaGetDevice();
auto& prop = GetDeviceProperty(device);
return prop.major >= 7;
#endif
}
const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
#ifndef __HIPCC__
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
#endif
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
#ifdef __HIPCC__
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
#endif
}
// To suppress compiler warning.
return "Unrecognized cublas error string";
}
const char* curandGetErrorString(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
#ifdef __HIPCC__
case HIPRAND_STATUS_NOT_IMPLEMENTED:
return "HIPRAND_STATUS_NOT_IMPLEMENTED";
#endif
}
// To suppress compiler warning.
return "Unrecognized curand error string";
}
// Turn on the flag g_caffe2_has_cuda_linked to true for HasCudaRuntime()
// function.
namespace {
class CudaRuntimeFlagFlipper {
public:
CudaRuntimeFlagFlipper() {
internal::SetCudaRuntimeFlag();
}
};
static CudaRuntimeFlagFlipper g_flipper;
} // namespace
} // namespace caffe2