Skip to content

Commit

Permalink
Merge pull request #1761 from willend/openacc-multicore-gpu
Browse files Browse the repository at this point in the history
OpenACC multicore + gpu in same binary
  • Loading branch information
willend authored Nov 14, 2024
2 parents c046362 + 8454cb1 commit d652117
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 31 deletions.
6 changes: 3 additions & 3 deletions common/lib/share/mccode-r.c
Original file line number Diff line number Diff line change
Expand Up @@ -3432,7 +3432,7 @@ long sort_absorb_last(_class_particle* particles, _class_particle* pbuffer, long
// than l, resulting in idling. We should distribute lengths more evenly.

// step 1: sort sub-arrays
#pragma acc parallel loop present(particles, pbuffer)
#pragma acc parallel loop present(particles[0:buffer_len], pbuffer[0:buffer_len])
for (unsigned long tidx=0; tidx<SAL_THREADS; tidx++) {
long lo = l*tidx;
long loclen = l;
Expand Down Expand Up @@ -3477,7 +3477,7 @@ long sort_absorb_last(_class_particle* particles, _class_particle* pbuffer, long
}

// step 2: write non-absorbed sub-arrays to psorted/output from the left
#pragma acc parallel loop present(pbuffer)
#pragma acc parallel loop present(pbuffer[0:buffer_len])
for (unsigned long tidx=0; tidx<SAL_THREADS; tidx++) {
long j, k;
#pragma acc loop seq
Expand All @@ -3501,7 +3501,7 @@ long sort_absorb_last(_class_particle* particles, _class_particle* pbuffer, long
return accumlen;

// copy non-absorbed block
#pragma acc parallel loop present(particles)
#pragma acc parallel loop present(particles[0:buffer_len])
for (long tidx = 0; tidx < accumlen; tidx++) { // tidx: thread index
unsigned long randstate[7];
_class_particle sourcebuffer;
Expand Down
69 changes: 41 additions & 28 deletions mcstas/src/cogen.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -1584,14 +1584,12 @@ void def_trace_section(struct instr_def *instr)
cout("/* (Similar defines are available in each comp trace but */");
cout("/* those are not enough to handle external libs etc. ) */");
cout("#ifdef OPENACC");
cout("#ifndef MULTICORE");
cout("#define fprintf(stderr,...) printf(__VA_ARGS__)");
cout("#define sprintf(string,...) printf(__VA_ARGS__)");
cout("#define exit(...) noprintf()");
cout("#define strcmp(a,b) str_comp(a,b)");
cout("#define strlen(a) str_len(a)");
cout("#endif");
cout("#endif");

/* define SCATTERED, ABSORB and RESTORE macros for all TRACE */
coutf("#define %s (_particle->%s)", "SCATTERED", "_scattered");
Expand Down Expand Up @@ -1636,15 +1634,13 @@ void undef_trace_section(struct instr_def *instr)
coutf("#undef %s", statepars_all[i]);

cout("#ifdef OPENACC");
cout("#ifndef MULTICORE");
cout("#undef strlen");
cout("#undef strcmp");
cout("#undef exit");
cout("#undef printf");
cout("#undef sprintf");
cout("#undef fprintf");
cout("#endif");
cout("#endif");
cout("#undef SCATTERED");
cout("#undef RESTORE" );
#if MCCODE_PROJECT == 1 /* neutron */
Expand Down Expand Up @@ -1969,11 +1965,14 @@ int cogen_raytrace(struct instr_def *instr)
coutf(" unsigned long long loops;");
coutf(" loops = ceil((double)ncount/gpu_innerloop);");
coutf(" /* if on GPU, printf has been globally nullified, re-enable here */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #undef printf");
coutf(" #endif");
coutf(" #endif");
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
coutf("");
coutf(" #ifdef OPENACC");
coutf(" if (ncount>gpu_innerloop) {");
Expand All @@ -1993,11 +1992,14 @@ int cogen_raytrace(struct instr_def *instr)
coutf(" #endif");
coutf("");
coutf(" /* if on GPU, re-nullify printf */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #define printf(...) noprintf()");
coutf(" #endif");
coutf(" #endif");
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
coutf("");
coutf(" #pragma acc parallel loop num_gangs(numgangs) vector_length(vecsize)");
coutf(" for (unsigned long pidx=0 ; pidx < gpu_innerloop ; pidx++) {");
Expand All @@ -2016,11 +2018,14 @@ int cogen_raytrace(struct instr_def *instr)
coutf(" seed = seed+gpu_innerloop;");
coutf(" } /* CPU for */");
coutf(" /* if on GPU, printf has been globally nullified, re-enable here */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #undef printf");
coutf(" #endif");
coutf(" #endif");
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
coutf(" MPI_MASTER(");
coutf(" printf(\"*** TRACE end *** \\n\");");
coutf(" );");
Expand Down Expand Up @@ -2072,13 +2077,15 @@ int cogen_rt_funnel(struct instr_def *instr)
coutf(" unsigned long long loops;");
coutf("");
coutf(" /* if on GPU, printf has been globally nullified, re-enable here */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #undef printf");
coutf(" #endif");
coutf(" #endif");
coutf("");
/* Check if instrument uses JUMPS */
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
/* Check if instrument uses JUMPS */
liter = list_iterate(instr->complist);
while((comp = list_next(liter)) != NULL) {
if (list_len(comp->jump) > 0) { // JUMP ITERATE counters
Expand Down Expand Up @@ -2123,7 +2130,7 @@ int cogen_rt_funnel(struct instr_def *instr)

// init batch
coutf(" // init particles");
coutf(" #pragma acc parallel loop present(particles)");
coutf(" #pragma acc parallel loop present(particles[0:livebatchsize])");
coutf(" for (unsigned long pidx=0 ; pidx < livebatchsize ; pidx++) {");
coutf(" // generate particle state, set loop index and seed");
coutf(" particles[pidx] = mcgenstate();");
Expand Down Expand Up @@ -2174,7 +2181,11 @@ int cogen_rt_funnel(struct instr_def *instr)
if (first || (comp->cpuonly != cpuonly_last) || comp->split) {
coutf("");
if (comp->cpuonly == 0) {
coutf(" #pragma acc parallel loop present(particles)");
coutf(" #pragma acc parallel loop present(particles[0:livebatchsize])");
} else {
coutf(" #ifdef MULTICORE");
coutf(" #pragma acc parallel loop device_type(host)");
coutf(" #endif");
}
coutf(" for (unsigned long pidx=0 ; pidx < livebatchsize ; pidx++) {");
coutf(" _class_particle* _particle = &particles[pidx];");
Expand All @@ -2186,11 +2197,13 @@ int cogen_rt_funnel(struct instr_def *instr)

// coordinate transformations (wrt to PREVIOUS)
if (comp->skip_transform == 0) {
coutf("#ifndef MULTICORE");
coutf(" if (_%s_var._rotation_is_identity)", comp->name);
coutf(" coords_get("
"coords_add(coords_set(x,y,z), _%s_var._position_relative),"
"&x, &y, &z);", comp->name);
cout( " else");
coutf("#endif");
coutf(" mccoordschange(_%s_var._position_relative, _%s_var._rotation_relative, _particle);", comp->name, comp->name);
cout( " _particle_save = *_particle;");
}
Expand Down

0 comments on commit d652117

Please sign in to comment.