kernels.cu (forked from google/multichase)
#include "arena.h"
#include "br_asm.h"
#include "cpu_util.h"
#include "expand.h"
#include "timer.h"
#include "util.h"
#include <stdint.h>  // uintptr_t (used in the kernel below)
#include <stdio.h>
#include <cuda/std/atomic>
#ifdef __i386__
#define MAX_PARALLEL (6) // maximum number of chases in parallel
#else
#define MAX_PARALLEL (10)
#endif
typedef struct chase_t chase_t;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}
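// Usage note (illustrative, not in the original source): wrap any CUDA
// runtime call so a failure aborts with file and line, e.g.
//   gpuErrchk(cudaMalloc(&ptr, size));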
typedef union {
  char pad[AVOID_FALSE_SHARING];
  struct {
    unsigned thread_num;        // which thread is this
    unsigned count;             // count of number of iterations
    void *cycle[MAX_PARALLEL];  // initial address for the chases
    const char *extra_args;
    int dummy;                  // useful for confusing the compiler
    const struct generate_chase_common_args *genchase_args;
    size_t nr_threads;
    const chase_t *chase;
    void *flush_arena;
    size_t cache_flush_size;
    bool use_longer_chase;
    int branch_chunk_size;
  } x;
} per_thread_t;
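// chase_simple_kernel runs a single serial pointer chase on the GPU: every
// step is a dependent load, so the loop measures load-to-use latency rather
// than bandwidth. The x200 expansion amortizes loop and atomic overhead
// across 200 chases per counter update.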
__global__ void chase_simple_kernel(per_thread_t *t) {
  void *p = t->x.cycle[0];
  do {
    x200(p = *(void **)p;)
    // atomically publish progress so the host can sample t->x.count;
    // the CPU-side equivalent is:
    //   } while (__sync_add_and_fetch(&t->x.count, 200));
  } while (((cuda::std::atomic<unsigned> *)&t->x.count)->fetch_add(200) + 200);
  // we never actually reach here, but the compiler doesn't know that
  t->x.dummy = (uintptr_t)p;
}
extern "C" {
void chase_simple_kernel_gpu(per_thread_t *t) {
chase_simple_kernel<<<1, 1>>>(t);
cudaDeviceSynchronize();
}
}
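/*
 * Host-side sketch (an illustration, not multichase's actual harness): one
 * minimal way to drive chase_simple_kernel_gpu. The managed allocations and
 * the hand-built two-node cycle are assumptions made for this example; the
 * real benchmark builds its chase through the arena/genchase machinery.
 * Note that the kernel loops effectively forever, so the synchronize inside
 * chase_simple_kernel_gpu will not return in practice; a real caller would
 * sample t->x.count from another host thread.
 *
 *   per_thread_t *t;
 *   void **a, **b;
 *   gpuErrchk(cudaMallocManaged(&t, sizeof(*t)));
 *   gpuErrchk(cudaMallocManaged(&a, sizeof(void *)));
 *   gpuErrchk(cudaMallocManaged(&b, sizeof(void *)));
 *   *a = (void *)b;             // a -> b
 *   *b = (void *)a;             // b -> a: a closed two-node cycle
 *   t->x.count = 0;
 *   t->x.cycle[0] = (void *)a;  // the kernel starts chasing here
 *   chase_simple_kernel_gpu(t); // launches <<<1,1>>> and synchronizes
 */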