From 97fb9c18afaa83ebd65a09068cc68ee5886e327d Mon Sep 17 00:00:00 2001 From: Moritz Date: Fri, 17 Dec 2021 14:14:58 +0100 Subject: [PATCH 1/8] Hamiltonian_Heisenberg.cu: Split gradient functions into separate device functions --- core/src/engine/Hamiltonian_Heisenberg.cu | 100 +++++++++++++--------- 1 file changed, 61 insertions(+), 39 deletions(-) diff --git a/core/src/engine/Hamiltonian_Heisenberg.cu b/core/src/engine/Hamiltonian_Heisenberg.cu index ed3153fd0..df9222861 100644 --- a/core/src/engine/Hamiltonian_Heisenberg.cu +++ b/core/src/engine/Hamiltonian_Heisenberg.cu @@ -699,19 +699,27 @@ namespace Engine } } + __device__ Vector3 _orth(const Vector3 & grad, const Vector3 & spin) + { + return grad - grad.dot(spin) * spin; + } + __device__ void CU_Gradient_Zeeman_Cell(int icell, const int * atom_types, const int n_cell_atoms, const scalar * mu_s, const scalar external_field_magnitude, const Vector3 external_field_normal, Vector3 * gradient, size_t n_cells_total) + { + for (int ibasis=0; ibasis= 0) + { + gradient[ispin] -= magnitudes[ipair]*spins[jspin]; + } + } + } __global__ void CU_Gradient_Exchange(const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, int n_pairs, const Pair * pairs, const scalar * magnitudes, Vector3 * gradient, size_t size) { - int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; - int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; - for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; icell < size; icell += blockDim.x * gridDim.x) { - for(auto ipair = 0; ipair < n_pairs; ++ipair) - { - int ispin = pairs[ipair].i + icell*n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); - if (jspin >= 0) - { - gradient[ispin] -= magnitudes[ipair]*spins[jspin]; - } - } + CU_Gradient_Exchange_Cell(icell, spins, atom_types, boundary_conditions, n_cells, n_cell_atoms, n_pairs, pairs, magnitudes, gradient, size); } } void Hamiltonian_Heisenberg::Gradient_Exchange(const vectorfield & spins, vectorfield & gradient) @@ -776,26 +792,32 @@ namespace Engine CU_CHECK_AND_SYNC(); } + __device__ void CU_Gradient_DMI_Cell(int icell, const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, + int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Vector3 * gradient, size_t size) + { + const int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; + const int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; + for(auto ipair = 0; ipair < n_pairs; ++ipair) + { + int ispin = pairs[ipair].i + icell*n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); + if (jspin >= 0) + { + gradient[ispin] -= magnitudes[ipair]*spins[jspin].cross(normals[ipair]); + } + } + } + __global__ void CU_Gradient_DMI(const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Vector3 * gradient, size_t size) { - int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; - int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; icell < size; icell += blockDim.x * gridDim.x) { - for(auto ipair = 0; ipair < n_pairs; ++ipair) - { - int ispin = pairs[ipair].i + icell*n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); - if (jspin >= 0) - { - gradient[ispin] -= magnitudes[ipair]*spins[jspin].cross(normals[ipair]); - } - } + CU_Gradient_DMI_Cell(icell, spins, atom_types, boundary_conditions, n_cells, n_cell_atoms, n_pairs, pairs, magnitudes, normals, gradient, size); } } void Hamiltonian_Heisenberg::Gradient_DMI(const vectorfield & spins, vectorfield & gradient) From 4aa2f690e4f25b489c1e91faccb18e80f867e136 Mon Sep 17 00:00:00 2001 From: Moritz Date: Fri, 17 Dec 2021 15:36:41 +0100 Subject: [PATCH 2/8] Refactor Hamiltonian_Heisenberg.cu: Reduced number of arguments in kernel calls. Now passes pointers to fields in Hamiltonian and geometry in helper struct. --- core/src/engine/Hamiltonian_Heisenberg.cu | 148 +++++++++++----------- 1 file changed, 77 insertions(+), 71 deletions(-) diff --git a/core/src/engine/Hamiltonian_Heisenberg.cu b/core/src/engine/Hamiltonian_Heisenberg.cu index df9222861..e76ce85f8 100644 --- a/core/src/engine/Hamiltonian_Heisenberg.cu +++ b/core/src/engine/Hamiltonian_Heisenberg.cu @@ -24,6 +24,21 @@ using Engine::Vectormath::cu_tupel_from_idx; namespace Engine { + struct Hamiltonian_Kernel_Data + { + // Helper structure to collect pointers that are needed in the cuda kernels, can be copy passed as argument in __global__ functions + const int * n_cells; + const int n_cell_atoms; + const int n_cells_total; + const int * atom_types; + const scalar * mu_s; + const int * boundary_conditions; + + Hamiltonian_Kernel_Data(const Hamiltonian_Heisenberg & ham) + : n_cells(ham.geometry->n_cells.data()), n_cell_atoms(ham.geometry->n_cell_atoms), n_cells_total(ham.geometry->n_cells_total), atom_types( ham.geometry->atom_types.data()), mu_s(ham.geometry->mu_s.data()), boundary_conditions(ham.boundary_conditions.data()) + { } + }; + // Construct a Heisenberg Hamiltonian with pairs Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( scalar external_field_magnitude, Vector3 external_field_normal, @@ -248,38 +263,38 @@ namespace Engine } - __global__ void CU_E_Zeeman(const Vector3 * spins, const int * atom_types, const int n_cell_atoms, const scalar * mu_s, const scalar external_field_magnitude, const Vector3 external_field_normal, scalar * Energy, size_t n_cells_total) + __global__ void CU_E_Zeeman(const Vector3 * spins, scalar * Energy, const scalar external_field_magnitude, const Vector3 external_field_normal, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < n_cells_total; + icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - for (int ibasis=0; ibasisn_cells_total; - CU_E_Zeeman<<<(size+1023)/1024, 1024>>>(spins.data(), this->geometry->atom_types.data(), geometry->n_cell_atoms, geometry->mu_s.data(), this->external_field_magnitude, this->external_field_normal, Energy.data(), size); + CU_E_Zeeman<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), this->external_field_magnitude, this->external_field_normal, Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __global__ void CU_E_Anisotropy(const Vector3 * spins, const int * atom_types, const int n_cell_atoms, const int n_anisotropies, const int * anisotropy_indices, const scalar * anisotropy_magnitude, const Vector3 * anisotropy_normal, scalar * Energy, size_t n_cells_total) + __global__ void CU_E_Anisotropy(const Vector3 * spins, scalar * Energy, const int n_anisotropies, const int * anisotropy_indices, const scalar * anisotropy_magnitude, const Vector3 * anisotropy_normal, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < n_cells_total; + icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { for (int iani=0; ianin_cells_total; - CU_E_Anisotropy<<<(size+1023)/1024, 1024>>>(spins.data(), this->geometry->atom_types.data(), this->geometry->n_cell_atoms, this->anisotropy_indices.size(), this->anisotropy_indices.data(), this->anisotropy_magnitudes.data(), this->anisotropy_normals.data(), Energy.data(), size); + CU_E_Anisotropy<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), this->anisotropy_indices.size(), this->anisotropy_indices.data(), this->anisotropy_magnitudes.data(), this->anisotropy_normals.data(), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __global__ void CU_E_Exchange(const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, - int n_pairs, const Pair * pairs, const scalar * magnitudes, scalar * Energy, size_t size) + __global__ void CU_E_Exchange(const Vector3 * spins, scalar * Energy, int n_pairs, const Pair * pairs, const scalar * magnitudes, Hamiltonian_Kernel_Data data) { - int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; - int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; + int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; + int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < size; + icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { for(auto ipair = 0; ipair < n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); + int ispin = pairs[ipair].i + icell* data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); if (jspin >= 0) { Energy[ispin] -= 0.5 * magnitudes[ipair] * spins[ispin].dot(spins[jspin]); @@ -316,26 +330,25 @@ namespace Engine void Hamiltonian_Heisenberg::E_Exchange(const vectorfield & spins, scalarfield & Energy) { int size = geometry->n_cells_total; - CU_E_Exchange<<<(size+1023)/1024, 1024>>>(spins.data(), this->geometry->atom_types.data(), boundary_conditions.data(), geometry->n_cells.data(), geometry->n_cell_atoms, - this->exchange_pairs.size(), this->exchange_pairs.data(), this->exchange_magnitudes.data(), Energy.data(), size); + CU_E_Exchange<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), + this->exchange_pairs.size(), this->exchange_pairs.data(), this->exchange_magnitudes.data(), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __global__ void CU_E_DMI(const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, - int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, scalar * Energy, size_t size) + __global__ void CU_E_DMI(const Vector3 * spins, scalar * Energy, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Hamiltonian_Kernel_Data data) { - int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; - int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; + int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; + int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < size; + icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { for(auto ipair = 0; ipair < n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); + int ispin = pairs[ipair].i + icell*data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); if (jspin >= 0) { Energy[ispin] -= 0.5 * magnitudes[ipair] * normals[ipair].dot(spins[ispin].cross(spins[jspin])); @@ -346,8 +359,8 @@ namespace Engine void Hamiltonian_Heisenberg::E_DMI(const vectorfield & spins, scalarfield & Energy) { int size = geometry->n_cells_total; - CU_E_DMI<<<(size+1023)/1024, 1024>>>(spins.data(), this->geometry->atom_types.data(), boundary_conditions.data(), geometry->n_cells.data(), geometry->n_cell_atoms, - this->dmi_pairs.size(), this->dmi_pairs.data(), this->dmi_magnitudes.data(), this->dmi_normals.data(), Energy.data(), size); + CU_E_DMI<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), + this->dmi_pairs.size(), this->dmi_pairs.data(), this->dmi_magnitudes.data(), this->dmi_normals.data(), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } @@ -704,69 +717,68 @@ namespace Engine return grad - grad.dot(spin) * spin; } - __device__ void CU_Gradient_Zeeman_Cell(int icell, const int * atom_types, const int n_cell_atoms, const scalar * mu_s, const scalar external_field_magnitude, const Vector3 external_field_normal, Vector3 * gradient, size_t n_cells_total) + __device__ void CU_Gradient_Zeeman_Cell(int icell, Vector3 * gradient, const scalar external_field_magnitude, const Vector3 external_field_normal, const Hamiltonian_Kernel_Data & data) { - for (int ibasis=0; ibasisn_cells_total; - CU_Gradient_Zeeman<<<(size+1023)/1024, 1024>>>( this->geometry->atom_types.data(), geometry->n_cell_atoms, geometry->mu_s.data(), this->external_field_magnitude, this->external_field_normal, gradient.data(), size ); + CU_Gradient_Zeeman<<<(size+1023)/1024, 1024>>>( gradient.data(), this->external_field_magnitude, this->external_field_normal, Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __device__ void CU_Gradient_Anisotropy_Cell(int icell, const Vector3 * spins, const int * atom_types, const int n_cell_atoms, const int n_anisotropies, const int * anisotropy_indices, const scalar * anisotropy_magnitude, const Vector3 * anisotropy_normal, Vector3 * gradient, size_t n_cells_total) + __device__ void CU_Gradient_Anisotropy_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const int n_anisotropies, const int * anisotropy_indices, const scalar * anisotropy_magnitude, const Vector3 * anisotropy_normal, const Hamiltonian_Kernel_Data & data) { for (int iani=0; ianin_cells_total; - CU_Gradient_Anisotropy<<<(size+1023)/1024, 1024>>>( spins.data(), this->geometry->atom_types.data(), this->geometry->n_cell_atoms, this->anisotropy_indices.size(), this->anisotropy_indices.data(), this->anisotropy_magnitudes.data(), this->anisotropy_normals.data(), gradient.data(), size ); + CU_Gradient_Anisotropy<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), this->anisotropy_indices.size(), this->anisotropy_indices.data(), this->anisotropy_magnitudes.data(), this->anisotropy_normals.data(), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __device__ void CU_Gradient_Exchange_Cell(int icell, const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, - int n_pairs, const Pair * pairs, const scalar * magnitudes, Vector3 * gradient, size_t size) + __device__ void CU_Gradient_Exchange_Cell(int icell, const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Hamiltonian_Kernel_Data & data) { - const int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; - const int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; + const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; + const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; for(auto ipair = 0; ipair < n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); + int ispin = pairs[ipair].i + icell*data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); if (jspin >= 0) { gradient[ispin] -= magnitudes[ipair]*spins[jspin]; @@ -774,34 +786,31 @@ namespace Engine } } - __global__ void CU_Gradient_Exchange(const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, - int n_pairs, const Pair * pairs, const scalar * magnitudes, Vector3 * gradient, size_t size) + __global__ void CU_Gradient_Exchange(const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < size; + icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - CU_Gradient_Exchange_Cell(icell, spins, atom_types, boundary_conditions, n_cells, n_cell_atoms, n_pairs, pairs, magnitudes, gradient, size); + CU_Gradient_Exchange_Cell(icell, spins, gradient, n_pairs, pairs, magnitudes, data); } } void Hamiltonian_Heisenberg::Gradient_Exchange(const vectorfield & spins, vectorfield & gradient) { int size = geometry->n_cells_total; - CU_Gradient_Exchange<<<(size+1023)/1024, 1024>>>( spins.data(), this->geometry->atom_types.data(), boundary_conditions.data(), geometry->n_cells.data(), geometry->n_cell_atoms, - this->exchange_pairs.size(), this->exchange_pairs.data(), this->exchange_magnitudes.data(), gradient.data(), size ); + CU_Gradient_Exchange<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), this->exchange_pairs.size(), this->exchange_pairs.data(), this->exchange_magnitudes.data(), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __device__ void CU_Gradient_DMI_Cell(int icell, const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, - int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Vector3 * gradient, size_t size) + __device__ void CU_Gradient_DMI_Cell(int icell, const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, const Hamiltonian_Kernel_Data & data) { - const int bc[3]={boundary_conditions[0],boundary_conditions[1],boundary_conditions[2]}; - const int nc[3]={n_cells[0],n_cells[1],n_cells[2]}; + const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; + const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; for(auto ipair = 0; ipair < n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, n_cell_atoms, atom_types, pairs[ipair]); + int ispin = pairs[ipair].i + icell*data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); if (jspin >= 0) { gradient[ispin] -= magnitudes[ipair]*spins[jspin].cross(normals[ipair]); @@ -809,22 +818,19 @@ namespace Engine } } - __global__ void CU_Gradient_DMI(const Vector3 * spins, const int * atom_types, const int * boundary_conditions, const int * n_cells, int n_cell_atoms, - int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Vector3 * gradient, size_t size) + __global__ void CU_Gradient_DMI(const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Hamiltonian_Kernel_Data data) { - for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < size; + icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - CU_Gradient_DMI_Cell(icell, spins, atom_types, boundary_conditions, n_cells, n_cell_atoms, n_pairs, pairs, magnitudes, normals, gradient, size); + CU_Gradient_DMI_Cell(icell, spins, gradient, n_pairs, pairs, magnitudes, normals, data); } } void Hamiltonian_Heisenberg::Gradient_DMI(const vectorfield & spins, vectorfield & gradient) { int size = geometry->n_cells_total; - CU_Gradient_DMI<<<(size+1023)/1024, 1024>>>( spins.data(), this->geometry->atom_types.data(), boundary_conditions.data(), geometry->n_cells.data(), geometry->n_cell_atoms, - this->dmi_pairs.size(), this->dmi_pairs.data(), this->dmi_magnitudes.data(), this->dmi_normals.data(), gradient.data(), size ); + CU_Gradient_DMI<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), this->dmi_pairs.size(), this->dmi_pairs.data(), this->dmi_magnitudes.data(), this->dmi_normals.data(), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } From 8f25bf3147c56a08fa83e0691515936a561b8872 Mon Sep 17 00:00:00 2001 From: Moritz Date: Fri, 17 Dec 2021 16:06:35 +0100 Subject: [PATCH 3/8] Hamiltonian_Heisenberg.cu: Reduced of arguments in kernel calls further. Now passes pointers to interaction data in helper structs. --- core/src/engine/Hamiltonian_Heisenberg.cu | 146 ++++++++++++++-------- 1 file changed, 94 insertions(+), 52 deletions(-) diff --git a/core/src/engine/Hamiltonian_Heisenberg.cu b/core/src/engine/Hamiltonian_Heisenberg.cu index e76ce85f8..793273421 100644 --- a/core/src/engine/Hamiltonian_Heisenberg.cu +++ b/core/src/engine/Hamiltonian_Heisenberg.cu @@ -1,4 +1,4 @@ -#ifdef SPIRIT_USE_CUDA +// #ifdef SPIRIT_USE_CUDA #include #include @@ -39,6 +39,49 @@ namespace Engine { } }; + struct External_Field_Data + { + // Helper structure to collect pointers that are needed in the cuda kernels, can be copy passed as argument in __global__ functions + const scalar magnitude; + const Vector3 normal; + + External_Field_Data(const Hamiltonian_Heisenberg & ham) + : normal(ham.external_field_normal), magnitude(ham.external_field_magnitude) + { } + }; + + struct Anisotropy_Data + { + // Helper structure to collect pointers that are needed in the cuda kernels, can be copy passed as argument in __global__ functions + const int n_anisotropies; + const int * indices; + const scalar * magnitude; + const Vector3 * normal; + + Anisotropy_Data(const Hamiltonian_Heisenberg & ham) + : n_anisotropies(ham.anisotropy_indices.size()), indices(ham.anisotropy_indices.data()), magnitude(ham.anisotropy_magnitudes.data()), normal(ham.anisotropy_normals.data()) + { } + }; + + struct Exchange_Data + { + const scalar * magnitudes; + const Pair * pairs; + const int n_pairs; + + Exchange_Data(const Hamiltonian_Heisenberg & ham) : magnitudes(ham.exchange_magnitudes.data()), pairs(ham.exchange_pairs.data()), n_pairs(ham.exchange_pairs.size()) {} + }; + + struct DMI_Data + { + const scalar * magnitudes; + const Pair * pairs; + const Vector3 * normals; + const int n_pairs; + + DMI_Data(const Hamiltonian_Heisenberg & ham) : magnitudes(ham.dmi_magnitudes.data()), pairs(ham.dmi_pairs.data()), normals(ham.dmi_normals.data()), n_pairs(ham.exchange_pairs.size()) {} + }; + // Construct a Heisenberg Hamiltonian with pairs Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( scalar external_field_magnitude, Vector3 external_field_normal, @@ -263,7 +306,7 @@ namespace Engine } - __global__ void CU_E_Zeeman(const Vector3 * spins, scalar * Energy, const scalar external_field_magnitude, const Vector3 external_field_normal, Hamiltonian_Kernel_Data data) + __global__ void CU_E_Zeeman(const Vector3 * spins, scalar * Energy, External_Field_Data external_field, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; icell < data.n_cells_total; @@ -273,41 +316,41 @@ namespace Engine { int ispin = data.n_cell_atoms * icell + ibasis; if ( cu_check_atom_type(data.atom_types[ispin]) ) - Energy[ispin] -= data.mu_s[ispin] * external_field_magnitude * external_field_normal.dot(spins[ispin]); + Energy[ispin] -= data.mu_s[ispin] * external_field.magnitude * external_field.normal.dot(spins[ispin]); } } } void Hamiltonian_Heisenberg::E_Zeeman(const vectorfield & spins, scalarfield & Energy) { int size = geometry->n_cells_total; - CU_E_Zeeman<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), this->external_field_magnitude, this->external_field_normal, Hamiltonian_Kernel_Data(*this)); + CU_E_Zeeman<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), External_Field_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __global__ void CU_E_Anisotropy(const Vector3 * spins, scalar * Energy, const int n_anisotropies, const int * anisotropy_indices, const scalar * anisotropy_magnitude, const Vector3 * anisotropy_normal, Hamiltonian_Kernel_Data data) + __global__ void CU_E_Anisotropy(const Vector3 * spins, scalar * Energy, Anisotropy_Data anisotropy, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - for (int iani=0; ianin_cells_total; - CU_E_Anisotropy<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), this->anisotropy_indices.size(), this->anisotropy_indices.data(), this->anisotropy_magnitudes.data(), this->anisotropy_normals.data(), Hamiltonian_Kernel_Data(*this)); + CU_E_Anisotropy<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), Anisotropy_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __global__ void CU_E_Exchange(const Vector3 * spins, scalar * Energy, int n_pairs, const Pair * pairs, const scalar * magnitudes, Hamiltonian_Kernel_Data data) + __global__ void CU_E_Exchange(const Vector3 * spins, scalar * Energy, Exchange_Data exchange, Hamiltonian_Kernel_Data data) { int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; @@ -316,13 +359,13 @@ namespace Engine icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - for(auto ipair = 0; ipair < n_pairs; ++ipair) + for(auto ipair = 0; ipair < exchange.n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell* data.n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); + int ispin = exchange.pairs[ipair].i + icell* data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, exchange.pairs[ipair]); if (jspin >= 0) { - Energy[ispin] -= 0.5 * magnitudes[ipair] * spins[ispin].dot(spins[jspin]); + Energy[ispin] -= 0.5 * exchange.magnitudes[ipair] * spins[ispin].dot(spins[jspin]); } } } @@ -330,13 +373,12 @@ namespace Engine void Hamiltonian_Heisenberg::E_Exchange(const vectorfield & spins, scalarfield & Energy) { int size = geometry->n_cells_total; - CU_E_Exchange<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), - this->exchange_pairs.size(), this->exchange_pairs.data(), this->exchange_magnitudes.data(), Hamiltonian_Kernel_Data(*this)); + CU_E_Exchange<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), Exchange_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __global__ void CU_E_DMI(const Vector3 * spins, scalar * Energy, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Hamiltonian_Kernel_Data data) + __global__ void CU_E_DMI(const Vector3 * spins, scalar * Energy, DMI_Data dmi, Hamiltonian_Kernel_Data data) { int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; @@ -345,13 +387,13 @@ namespace Engine icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - for(auto ipair = 0; ipair < n_pairs; ++ipair) + for(auto ipair = 0; ipair < dmi.n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*data.n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); + int ispin = dmi.pairs[ipair].i + icell*data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, dmi.pairs[ipair]); if (jspin >= 0) { - Energy[ispin] -= 0.5 * magnitudes[ipair] * normals[ipair].dot(spins[ispin].cross(spins[jspin])); + Energy[ispin] -= 0.5 * dmi.magnitudes[ipair] * dmi.normals[ipair].dot(spins[ispin].cross(spins[jspin])); } } } @@ -360,7 +402,7 @@ namespace Engine { int size = geometry->n_cells_total; CU_E_DMI<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), - this->dmi_pairs.size(), this->dmi_pairs.data(), this->dmi_magnitudes.data(), this->dmi_normals.data(), Hamiltonian_Kernel_Data(*this)); + DMI_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } @@ -717,120 +759,120 @@ namespace Engine return grad - grad.dot(spin) * spin; } - __device__ void CU_Gradient_Zeeman_Cell(int icell, Vector3 * gradient, const scalar external_field_magnitude, const Vector3 external_field_normal, const Hamiltonian_Kernel_Data & data) + __device__ void CU_Gradient_Zeeman_Cell(int icell, Vector3 * gradient, const External_Field_Data & external_field, const Hamiltonian_Kernel_Data & data) { for (int ibasis=0; ibasisn_cells_total; - CU_Gradient_Zeeman<<<(size+1023)/1024, 1024>>>( gradient.data(), this->external_field_magnitude, this->external_field_normal, Hamiltonian_Kernel_Data(*this)); + CU_Gradient_Zeeman<<<(size+1023)/1024, 1024>>>( gradient.data(), External_Field_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __device__ void CU_Gradient_Anisotropy_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const int n_anisotropies, const int * anisotropy_indices, const scalar * anisotropy_magnitude, const Vector3 * anisotropy_normal, const Hamiltonian_Kernel_Data & data) + __device__ void CU_Gradient_Anisotropy_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const Anisotropy_Data & anisotropy, const Hamiltonian_Kernel_Data & data) { - for (int iani=0; ianin_cells_total; - CU_Gradient_Anisotropy<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), this->anisotropy_indices.size(), this->anisotropy_indices.data(), this->anisotropy_magnitudes.data(), this->anisotropy_normals.data(), Hamiltonian_Kernel_Data(*this)); + CU_Gradient_Anisotropy<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), Anisotropy_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __device__ void CU_Gradient_Exchange_Cell(int icell, const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Hamiltonian_Kernel_Data & data) + __device__ void CU_Gradient_Exchange_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const Exchange_Data & exchange, const Hamiltonian_Kernel_Data & data) { const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - for(auto ipair = 0; ipair < n_pairs; ++ipair) + for(auto ipair = 0; ipair < exchange.n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*data.n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); + int ispin = exchange.pairs[ipair].i + icell*data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, exchange.pairs[ipair]); if (jspin >= 0) { - gradient[ispin] -= magnitudes[ipair]*spins[jspin]; + gradient[ispin] -= exchange.magnitudes[ipair]*spins[jspin]; } } } - __global__ void CU_Gradient_Exchange(const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, Hamiltonian_Kernel_Data data) + __global__ void CU_Gradient_Exchange(const Vector3 * spins, Vector3 * gradient, Exchange_Data exchange, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - CU_Gradient_Exchange_Cell(icell, spins, gradient, n_pairs, pairs, magnitudes, data); + CU_Gradient_Exchange_Cell(icell, spins, gradient, exchange, data); } } void Hamiltonian_Heisenberg::Gradient_Exchange(const vectorfield & spins, vectorfield & gradient) { int size = geometry->n_cells_total; - CU_Gradient_Exchange<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), this->exchange_pairs.size(), this->exchange_pairs.data(), this->exchange_magnitudes.data(), Hamiltonian_Kernel_Data(*this)); + CU_Gradient_Exchange<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), Exchange_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } - __device__ void CU_Gradient_DMI_Cell(int icell, const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, const Hamiltonian_Kernel_Data & data) + __device__ void CU_Gradient_DMI_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const DMI_Data & dmi, const Hamiltonian_Kernel_Data & data) { const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - for(auto ipair = 0; ipair < n_pairs; ++ipair) + for(auto ipair = 0; ipair < dmi.n_pairs; ++ipair) { - int ispin = pairs[ipair].i + icell*data.n_cell_atoms; - int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, pairs[ipair]); + int ispin = dmi.pairs[ipair].i + icell*data.n_cell_atoms; + int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, dmi.pairs[ipair]); if (jspin >= 0) { - gradient[ispin] -= magnitudes[ipair]*spins[jspin].cross(normals[ipair]); + gradient[ispin] -= dmi.magnitudes[ipair]*spins[jspin].cross(dmi.normals[ipair]); } } } - __global__ void CU_Gradient_DMI(const Vector3 * spins, Vector3 * gradient, int n_pairs, const Pair * pairs, const scalar * magnitudes, const Vector3 * normals, Hamiltonian_Kernel_Data data) + __global__ void CU_Gradient_DMI(const Vector3 * spins, Vector3 * gradient, DMI_Data dmi, Hamiltonian_Kernel_Data data) { for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; icell < data.n_cells_total; icell += blockDim.x * gridDim.x) { - CU_Gradient_DMI_Cell(icell, spins, gradient, n_pairs, pairs, magnitudes, normals, data); + CU_Gradient_DMI_Cell(icell, spins, gradient, dmi, data); } } void Hamiltonian_Heisenberg::Gradient_DMI(const vectorfield & spins, vectorfield & gradient) { int size = geometry->n_cells_total; - CU_Gradient_DMI<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), this->dmi_pairs.size(), this->dmi_pairs.data(), this->dmi_magnitudes.data(), this->dmi_normals.data(), Hamiltonian_Kernel_Data(*this)); + CU_Gradient_DMI<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), DMI_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } @@ -1444,4 +1486,4 @@ namespace Engine const std::string& Hamiltonian_Heisenberg::Name() { return name; } } -#endif +// #endif \ No newline at end of file From 92d9a08abc20f3c030e428eb873dbf017888cd55 Mon Sep 17 00:00:00 2001 From: Moritz Date: Tue, 26 Jul 2022 15:13:56 +0200 Subject: [PATCH 4/8] Added a new struct that orders pairs according to the left interaction atom. --- core/include/engine/Pair_Sorting.hpp | 95 ++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 core/include/engine/Pair_Sorting.hpp diff --git a/core/include/engine/Pair_Sorting.hpp b/core/include/engine/Pair_Sorting.hpp new file mode 100644 index 000000000..bc1b7d762 --- /dev/null +++ b/core/include/engine/Pair_Sorting.hpp @@ -0,0 +1,95 @@ +#ifndef SPIRIT_CORE_PAIR_SORTING_HPP +#define SPIRIT_CORE_PAIR_SORTING_HPP +#include "Spirit_Defines.h" +#include +#include +#include +#include + +namespace Engine +{ + +struct Pair_Order +{ + field n_pairs_per_cell_atom; + field offset_per_cell_atom; + field indices; + + const field const * pairs_ref; + + Pair_Order() = default; + + Pair_Order( field & pairs, int n_cell_atoms ) + : n_pairs_per_cell_atom( field( n_cell_atoms, 0 ) ), + offset_per_cell_atom( field( n_cell_atoms, 0 ) ), + indices( field( pairs.size() ) ), + pairs_ref( &pairs ) + { + std::iota( indices.begin(), indices.end(), 0 ); + + auto pair_compare_function = [&]( const int & idx_l, const int & idx_r ) -> bool + { + const Pair & l = pairs[idx_l]; + const Pair & r = pairs[idx_r]; + return l.i < r.i; + }; + + std::sort( indices.begin(), indices.end(), pair_compare_function ); + + for( const auto & p : pairs ) + { + n_pairs_per_cell_atom[p.i]++; + } + + for( int i = 1; i < n_pairs_per_cell_atom.size(); i++ ) + { + offset_per_cell_atom[i] += offset_per_cell_atom[i - 1] + n_pairs_per_cell_atom[i - 1]; + } + } + + // Use the indices to sort an input field v + template + void Sort( field & v ) const + { + assert( v.size() == indices.size() ); + // We are pragmatic here and sort the array out of place + field v_copy = v; + for( int i = 0; i < indices.size(); i++ ) + { + v[i] = v_copy[indices[i]]; + } + } + + // For debugging + void Print( int n_pairs_print = 3 ) const + { + fmt::print( "== Pair Order ==\n" ); + + for(auto i : indices) + { + fmt::print("{} ", i); + } + fmt::print("\n"); + + fmt::print( "n_pairs_total = {}\n", pairs_ref->size() ); + for( int i = 0; i < n_pairs_per_cell_atom.size(); i++ ) + { + fmt::print( "Cell atom [{}]\n", i ); + fmt::print( " n_pairs = {}\n", n_pairs_per_cell_atom[i] ); + fmt::print( " offset = {}\n", offset_per_cell_atom[i] ); + fmt::print( " (Up to) first {} pairs:\n", n_pairs_print ); + + for( int j = 0; j < std::min( n_pairs_print, n_pairs_per_cell_atom[i] ); j++ ) + { + auto const & p = ( *pairs_ref )[offset_per_cell_atom[i] + j]; + fmt::print( + " - #{} = {:^4} {:^4} {:^4} {:^4} {:^4}\n", j, p.i, p.j, p.translations[0], p.translations[1], + p.translations[2] ); + } + } + } +}; + +} // namespace Engine + +#endif \ No newline at end of file From 5cb5043f8d2f60b0cfba4656b334799e9433ee6c Mon Sep 17 00:00:00 2001 From: Moritz Date: Tue, 26 Jul 2022 17:38:33 +0200 Subject: [PATCH 5/8] Hamiltonian_Heisenberg.cu: Used pair sorting to implement parallel exchange over spins --- .../include/engine/Hamiltonian_Heisenberg.hpp | 5 ++ core/src/engine/Hamiltonian_Heisenberg.cu | 64 ++++++++++++++----- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/core/include/engine/Hamiltonian_Heisenberg.hpp b/core/include/engine/Hamiltonian_Heisenberg.hpp index 896c45f0e..fef7b771f 100644 --- a/core/include/engine/Hamiltonian_Heisenberg.hpp +++ b/core/include/engine/Hamiltonian_Heisenberg.hpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace Engine { @@ -90,6 +91,8 @@ class Hamiltonian_Heisenberg : public Hamiltonian scalarfield exchange_magnitudes_in; pairfield exchange_pairs; scalarfield exchange_magnitudes; + Pair_Order exchange_pair_order; + // DMI scalarfield dmi_shell_magnitudes; int dmi_shell_chirality; @@ -99,6 +102,8 @@ class Hamiltonian_Heisenberg : public Hamiltonian pairfield dmi_pairs; scalarfield dmi_magnitudes; vectorfield dmi_normals; + Pair_Order dmi_pair_order; + // Dipole Dipole interaction DDI_Method ddi_method; intfield ddi_n_periodic_images; diff --git a/core/src/engine/Hamiltonian_Heisenberg.cu b/core/src/engine/Hamiltonian_Heisenberg.cu index da98c1573..6c1ba88f1 100644 --- a/core/src/engine/Hamiltonian_Heisenberg.cu +++ b/core/src/engine/Hamiltonian_Heisenberg.cu @@ -30,12 +30,13 @@ namespace Engine const int * n_cells; const int n_cell_atoms; const int n_cells_total; + const int nos; const int * atom_types; const scalar * mu_s; const int * boundary_conditions; Hamiltonian_Kernel_Data(const Hamiltonian_Heisenberg & ham) - : n_cells(ham.geometry->n_cells.data()), n_cell_atoms(ham.geometry->n_cell_atoms), n_cells_total(ham.geometry->n_cells_total), atom_types( ham.geometry->atom_types.data()), mu_s(ham.geometry->mu_s.data()), boundary_conditions(ham.boundary_conditions.data()) + : n_cells(ham.geometry->n_cells.data()), n_cell_atoms(ham.geometry->n_cell_atoms), n_cells_total(ham.geometry->n_cells_total), atom_types( ham.geometry->atom_types.data()), mu_s(ham.geometry->mu_s.data()), boundary_conditions(ham.boundary_conditions.data()), nos(ham.geometry->nos) { } }; @@ -69,7 +70,16 @@ namespace Engine const Pair * pairs; const int n_pairs; - Exchange_Data(const Hamiltonian_Heisenberg & ham) : magnitudes(ham.exchange_magnitudes.data()), pairs(ham.exchange_pairs.data()), n_pairs(ham.exchange_pairs.size()) {} + const int * n_pairs_per_cell_atom; + const int * offset_per_cell_atom; + + Exchange_Data(const Hamiltonian_Heisenberg & ham) : + magnitudes(ham.exchange_magnitudes.data()), + pairs(ham.exchange_pairs.data()), + n_pairs(ham.exchange_pairs.size()), + n_pairs_per_cell_atom(ham.exchange_pair_order.n_pairs_per_cell_atom.data()), + offset_per_cell_atom(ham.exchange_pair_order.offset_per_cell_atom.data()) + {} }; struct DMI_Data @@ -196,6 +206,10 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } } + exchange_pair_order = Pair_Order(this->exchange_pairs, this->geometry->n_cell_atoms); + exchange_pair_order.Sort(exchange_pairs); + exchange_pair_order.Sort(exchange_magnitudes); + // DMI this->dmi_pairs = pairfield(0); this->dmi_magnitudes = scalarfield(0); @@ -227,6 +241,11 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } } + dmi_pair_order = Pair_Order(this->dmi_pairs, this->geometry->n_cell_atoms); + dmi_pair_order.Sort(dmi_pairs); + dmi_pair_order.Sort(dmi_magnitudes); + dmi_pair_order.Sort(dmi_normals); + // Dipole-dipole (cutoff) scalar radius = this->ddi_cutoff_radius; if( this->ddi_method != DDI_Method::Cutoff ) @@ -409,19 +428,24 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( CU_CHECK_AND_SYNC(); } - __global__ void CU_E_Exchange(const Vector3 * spins, scalar * Energy, Exchange_Data exchange, Hamiltonian_Kernel_Data data) { int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < data.n_cells_total; - icell += blockDim.x * gridDim.x) + for(auto ispin = blockIdx.x * blockDim.x + threadIdx.x; + ispin < data.nos; + ispin += blockDim.x * gridDim.x) { - for(auto ipair = 0; ipair < exchange.n_pairs; ++ipair) + // Figure out the cell idx + const int cell_idx = ispin % data.n_cell_atoms; + + // Figure out the pairs we need to apply to this spin + int n_pairs = exchange.n_pairs_per_cell_atom[cell_idx]; + int offset_pairs = exchange.offset_per_cell_atom[cell_idx]; + + for(auto ipair = offset_pairs; ipair < offset_pairs + n_pairs; ++ipair) { - int ispin = exchange.pairs[ipair].i + icell* data.n_cell_atoms; int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, exchange.pairs[ipair]); if (jspin >= 0) { @@ -432,7 +456,7 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } void Hamiltonian_Heisenberg::E_Exchange(const vectorfield & spins, scalarfield & Energy) { - int size = geometry->n_cells_total; + int size = geometry->nos; CU_E_Exchange<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), Exchange_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } @@ -924,14 +948,20 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( CU_CHECK_AND_SYNC(); } - inline __device__ void CU_Gradient_Exchange_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const Exchange_Data & exchange, const Hamiltonian_Kernel_Data & data) + inline __device__ void CU_Gradient_Exchange_Spin(int ispin, const Vector3 * spins, Vector3 * gradient, const Exchange_Data & exchange, const Hamiltonian_Kernel_Data & data) { const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - for(auto ipair = 0; ipair < exchange.n_pairs; ++ipair) + // Figure out the cell idx + const int cell_idx = ispin % data.n_cell_atoms; + + // Figure out the pairs we need to apply to this spin + int n_pairs = exchange.n_pairs_per_cell_atom[cell_idx]; + int offset_pairs = exchange.offset_per_cell_atom[cell_idx]; + + for(auto ipair = offset_pairs; ipair < offset_pairs + n_pairs; ++ipair) { - int ispin = exchange.pairs[ipair].i + icell*data.n_cell_atoms; int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, exchange.pairs[ipair]); if (jspin >= 0) { @@ -941,16 +971,16 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } __global__ void CU_Gradient_Exchange(const Vector3 * spins, Vector3 * gradient, Exchange_Data exchange, Hamiltonian_Kernel_Data data) { - for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < data.n_cells_total; - icell += blockDim.x * gridDim.x) + for(auto ispin = blockIdx.x * blockDim.x + threadIdx.x; + ispin < data.nos; + ispin += blockDim.x * gridDim.x) { - CU_Gradient_Exchange_Cell(icell, spins, gradient, exchange, data); + CU_Gradient_Exchange_Spin(ispin, spins, gradient, exchange, data); } } void Hamiltonian_Heisenberg::Gradient_Exchange(const vectorfield & spins, vectorfield & gradient) { - int size = geometry->n_cells_total; + int size = geometry->nos; CU_Gradient_Exchange<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), Exchange_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } From d6b889c7ef897be811c2efd5faa89ca4819549a3 Mon Sep 17 00:00:00 2001 From: Moritz Date: Wed, 27 Jul 2022 12:31:58 +0200 Subject: [PATCH 6/8] Pair_Sorting: Implemented Pair subsorting to slightly improve performance and some comment changes --- core/include/engine/Pair_Sorting.hpp | 41 ++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/core/include/engine/Pair_Sorting.hpp b/core/include/engine/Pair_Sorting.hpp index bc1b7d762..6d2da4bb4 100644 --- a/core/include/engine/Pair_Sorting.hpp +++ b/core/include/engine/Pair_Sorting.hpp @@ -13,46 +13,65 @@ struct Pair_Order { field n_pairs_per_cell_atom; field offset_per_cell_atom; - field indices; + std::vector indices; const field const * pairs_ref; Pair_Order() = default; - Pair_Order( field & pairs, int n_cell_atoms ) + Pair_Order( field & pairs, const field & n_cells, int n_cell_atoms ) : n_pairs_per_cell_atom( field( n_cell_atoms, 0 ) ), offset_per_cell_atom( field( n_cell_atoms, 0 ) ), - indices( field( pairs.size() ) ), + indices( std::vector( pairs.size() ) ), pairs_ref( &pairs ) { std::iota( indices.begin(), indices.end(), 0 ); - auto pair_compare_function = [&]( const int & idx_l, const int & idx_r ) -> bool - { + auto pair_compare_function = [&]( const int & idx_l, const int & idx_r ) -> bool { const Pair & l = pairs[idx_l]; const Pair & r = pairs[idx_r]; - return l.i < r.i; + + if( l.i < r.i ) + return true; + else if( l.i > r.i ) + return false; + + // sub-sorting + // if l.i == r.i, sort by the expected difference of the linear idx into the spins array + int d_idx_l + = l.j + + n_cell_atoms + * ( l.translations[0] + n_cells[0] * ( l.translations[1] + n_cells[1] * l.translations[2] ) ); + int d_idx_r + = r.j + + n_cell_atoms + * ( r.translations[0] + n_cells[0] * ( r.translations[1] + n_cells[1] * r.translations[2] ) ); + + return d_idx_l < d_idx_r; }; std::sort( indices.begin(), indices.end(), pair_compare_function ); + // Count how many pairs interact with each cell atom for( const auto & p : pairs ) { n_pairs_per_cell_atom[p.i]++; } + // Compute the offsets at which each cell atom has to look into the sorted pairs vector for( int i = 1; i < n_pairs_per_cell_atom.size(); i++ ) { offset_per_cell_atom[i] += offset_per_cell_atom[i - 1] + n_pairs_per_cell_atom[i - 1]; } } - // Use the indices to sort an input field v + // Use the indices to sort an input field template void Sort( field & v ) const { assert( v.size() == indices.size() ); - // We are pragmatic here and sort the array out of place + + // Sort the array out of place field v_copy = v; for( int i = 0; i < indices.size(); i++ ) { @@ -65,11 +84,11 @@ struct Pair_Order { fmt::print( "== Pair Order ==\n" ); - for(auto i : indices) + for( auto i : indices ) { - fmt::print("{} ", i); + fmt::print( "{} ", i ); } - fmt::print("\n"); + fmt::print( "\n" ); fmt::print( "n_pairs_total = {}\n", pairs_ref->size() ); for( int i = 0; i < n_pairs_per_cell_atom.size(); i++ ) From c7d78434c8e5b501e4243ff9690b3c41e5264795 Mon Sep 17 00:00:00 2001 From: Moritz Date: Wed, 27 Jul 2022 12:53:33 +0200 Subject: [PATCH 7/8] Hamiltonian_Heisenberg.cu: DMI now also uses sorted pairs --- core/src/engine/Hamiltonian_Heisenberg.cu | 59 +++++++++++++++-------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/core/src/engine/Hamiltonian_Heisenberg.cu b/core/src/engine/Hamiltonian_Heisenberg.cu index 6c1ba88f1..a2febb221 100644 --- a/core/src/engine/Hamiltonian_Heisenberg.cu +++ b/core/src/engine/Hamiltonian_Heisenberg.cu @@ -89,7 +89,17 @@ namespace Engine const Vector3 * normals; const int n_pairs; - DMI_Data(const Hamiltonian_Heisenberg & ham) : magnitudes(ham.dmi_magnitudes.data()), pairs(ham.dmi_pairs.data()), normals(ham.dmi_normals.data()), n_pairs(ham.exchange_pairs.size()) {} + const int * n_pairs_per_cell_atom; + const int * offset_per_cell_atom; + + DMI_Data(const Hamiltonian_Heisenberg & ham) : + magnitudes(ham.dmi_magnitudes.data()), + pairs(ham.dmi_pairs.data()), + normals(ham.dmi_normals.data()), + n_pairs(ham.exchange_pairs.size()), + n_pairs_per_cell_atom(ham.dmi_pair_order.n_pairs_per_cell_atom.data()), + offset_per_cell_atom(ham.dmi_pair_order.offset_per_cell_atom.data()) + {} }; // Construct a Heisenberg Hamiltonian with pairs @@ -206,7 +216,7 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } } - exchange_pair_order = Pair_Order(this->exchange_pairs, this->geometry->n_cell_atoms); + exchange_pair_order = Pair_Order(this->exchange_pairs, this->geometry->n_cells, this->geometry->n_cell_atoms); exchange_pair_order.Sort(exchange_pairs); exchange_pair_order.Sort(exchange_magnitudes); @@ -241,7 +251,7 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } } - dmi_pair_order = Pair_Order(this->dmi_pairs, this->geometry->n_cell_atoms); + dmi_pair_order = Pair_Order(this->dmi_pairs, this->geometry->n_cells, this->geometry->n_cell_atoms); dmi_pair_order.Sort(dmi_pairs); dmi_pair_order.Sort(dmi_magnitudes); dmi_pair_order.Sort(dmi_normals); @@ -461,19 +471,24 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( CU_CHECK_AND_SYNC(); } - __global__ void CU_E_DMI(const Vector3 * spins, scalar * Energy, DMI_Data dmi, Hamiltonian_Kernel_Data data) { int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < data.n_cells_total; - icell += blockDim.x * gridDim.x) + for(auto ispin = blockIdx.x * blockDim.x + threadIdx.x; + ispin < data.nos; + ispin += blockDim.x * gridDim.x) { - for(auto ipair = 0; ipair < dmi.n_pairs; ++ipair) + // Figure out the index of the curretn spin within its cell + const int cell_idx = ispin % data.n_cell_atoms; + + // Figure out the pairs we need to apply to this spin + int n_pairs = dmi.n_pairs_per_cell_atom[cell_idx]; + int offset_pairs = dmi.offset_per_cell_atom[cell_idx]; + + for(auto ipair = offset_pairs; ipair < offset_pairs + n_pairs; ++ipair) { - int ispin = dmi.pairs[ipair].i + icell*data.n_cell_atoms; int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, dmi.pairs[ipair]); if (jspin >= 0) { @@ -484,7 +499,7 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } void Hamiltonian_Heisenberg::E_DMI(const vectorfield & spins, scalarfield & Energy) { - int size = geometry->n_cells_total; + int size = geometry->nos; CU_E_DMI<<<(size+1023)/1024, 1024>>>(spins.data(), Energy.data(), DMI_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); @@ -953,7 +968,7 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - // Figure out the cell idx + // Figure out the index of the current spin within its cell const int cell_idx = ispin % data.n_cell_atoms; // Figure out the pairs we need to apply to this spin @@ -985,14 +1000,20 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( CU_CHECK_AND_SYNC(); } - inline __device__ void CU_Gradient_DMI_Cell(int icell, const Vector3 * spins, Vector3 * gradient, const DMI_Data & dmi, const Hamiltonian_Kernel_Data & data) + inline __device__ void CU_Gradient_DMI_Spin(int ispin, const Vector3 * spins, Vector3 * gradient, const DMI_Data & dmi, const Hamiltonian_Kernel_Data & data) { const int bc[3] = {data.boundary_conditions[0], data.boundary_conditions[1], data.boundary_conditions[2]}; const int nc[3] = {data.n_cells[0], data.n_cells[1], data.n_cells[2]}; - for(auto ipair = 0; ipair < dmi.n_pairs; ++ipair) + // Figure out the index of the current spin within its cell + const int cell_idx = ispin % data.n_cell_atoms; + + // Figure out the pairs we need to apply to this spin + int n_pairs = dmi.n_pairs_per_cell_atom[cell_idx]; + int offset_pairs = dmi.offset_per_cell_atom[cell_idx]; + + for(auto ipair = offset_pairs; ipair < offset_pairs + n_pairs; ++ipair) { - int ispin = dmi.pairs[ipair].i + icell*data.n_cell_atoms; int jspin = cu_idx_from_pair(ispin, bc, nc, data.n_cell_atoms, data.atom_types, dmi.pairs[ipair]); if (jspin >= 0) { @@ -1002,16 +1023,16 @@ Hamiltonian_Heisenberg::Hamiltonian_Heisenberg( } __global__ void CU_Gradient_DMI(const Vector3 * spins, Vector3 * gradient, DMI_Data dmi, Hamiltonian_Kernel_Data data) { - for(auto icell = blockIdx.x * blockDim.x + threadIdx.x; - icell < data.n_cells_total; - icell += blockDim.x * gridDim.x) + for(auto ispin = blockIdx.x * blockDim.x + threadIdx.x; + ispin < data.nos; + ispin += blockDim.x * gridDim.x) { - CU_Gradient_DMI_Cell(icell, spins, gradient, dmi, data); + CU_Gradient_DMI_Spin(ispin, spins, gradient, dmi, data); } } void Hamiltonian_Heisenberg::Gradient_DMI(const vectorfield & spins, vectorfield & gradient) { - int size = geometry->n_cells_total; + int size = geometry->nos; CU_Gradient_DMI<<<(size+1023)/1024, 1024>>>( spins.data(), gradient.data(), DMI_Data(*this), Hamiltonian_Kernel_Data(*this)); CU_CHECK_AND_SYNC(); } From 6b924f201c1c722de9dcd8447304eaaefc679b4a Mon Sep 17 00:00:00 2001 From: Moritz Date: Wed, 27 Jul 2022 12:56:25 +0200 Subject: [PATCH 8/8] Geometry: check for co-incident positions is now only performed if n_cell_atoms<100 The reason is that this check scales quadratically in n_cell_atoms. --- core/src/data/Geometry.cpp | 60 +++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/core/src/data/Geometry.cpp b/core/src/data/Geometry.cpp index 5b9cf2834..3304a6534 100644 --- a/core/src/data/Geometry.cpp +++ b/core/src/data/Geometry.cpp @@ -98,39 +98,47 @@ void Geometry::generatePositions() std::int64_t max_b = std::min( 10, n_cells[1] ); std::int64_t max_c = std::min( 10, n_cells[2] ); Vector3 diff; - for( std::int64_t i = 0; i < n_cell_atoms; ++i ) + + // This scales quadratically in n_cell_atoms, so we check for co-incident positions only if n_cell_atoms is somewhat small! + if( n_cell_atoms < 100 ) { - for( std::int64_t j = 0; j < n_cell_atoms; ++j ) + for( std::int64_t i = 0; i < n_cell_atoms; ++i ) { - for( std::int64_t da = -max_a; da <= max_a; ++da ) + for( std::int64_t j = 0; j < n_cell_atoms; ++j ) { - for( std::int64_t db = -max_b; db <= max_b; ++db ) + for( std::int64_t da = -max_a; da <= max_a; ++da ) { - for( std::int64_t dc = -max_c; dc <= max_c; ++dc ) + for( std::int64_t db = -max_b; db <= max_b; ++db ) { - // Check if translated basis atom is at position of another basis atom - diff = cell_atoms[i] - ( cell_atoms[j] + Vector3{ scalar( da ), scalar( db ), scalar( dc ) } ); + for( std::int64_t dc = -max_c; dc <= max_c; ++dc ) + { + // Check if translated basis atom is at position of another basis atom + diff = cell_atoms[i] + - ( cell_atoms[j] + Vector3{ scalar( da ), scalar( db ), scalar( dc ) } ); - bool same_position = std::abs( diff[0] ) < epsilon && std::abs( diff[1] ) < epsilon - && std::abs( diff[2] ) < epsilon; + bool same_position = std::abs( diff[0] ) < epsilon && std::abs( diff[1] ) < epsilon + && std::abs( diff[2] ) < epsilon; - if( same_position && ( i != j || da != 0 || db != 0 || dc != 0 ) ) - { - Vector3 position - = lattice_constant - * ( ( static_cast( da ) + cell_atoms[i][0] ) * bravais_vectors[0] - + ( static_cast( db ) + cell_atoms[i][1] ) * bravais_vectors[1] - + ( static_cast( dc ) + cell_atoms[i][2] ) * bravais_vectors[2] ); - - std::string message = fmt::format( - "Unable to initialize spin-system, because for a translation vector ({} {} {}), spins " - "{} and {} of the basis-cell occupy the same absolute position ({}) within a margin of " - "{} Angstrom. Please check the config file!", - da, db, dc, i, j, position.transpose(), epsilon ); - - spirit_throw( - Utility::Exception_Classifier::System_not_Initialized, Utility::Log_Level::Severe, - message ); + if( same_position && ( i != j || da != 0 || db != 0 || dc != 0 ) ) + { + Vector3 position + = lattice_constant + * ( ( static_cast( da ) + cell_atoms[i][0] ) * bravais_vectors[0] + + ( static_cast( db ) + cell_atoms[i][1] ) * bravais_vectors[1] + + ( static_cast( dc ) + cell_atoms[i][2] ) * bravais_vectors[2] ); + + std::string message = fmt::format( + "Unable to initialize spin-system, because for a translation vector ({} {} {}), " + "spins " + "{} and {} of the basis-cell occupy the same absolute position ({}) within a " + "margin of " + "{} Angstrom. Please check the config file!", + da, db, dc, i, j, position.transpose(), epsilon ); + + spirit_throw( + Utility::Exception_Classifier::System_not_Initialized, Utility::Log_Level::Severe, + message ); + } } } }