From 32c9f06517428ce073fb6d112fec65f600ec52f8 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Sat, 14 May 2022 15:58:51 +0530 Subject: [PATCH 1/3] using work-group local memory for keeping (inverse) look up table Signed-off-by: Anjan Roy --- include/harpocrates.hpp | 106 +++++++++ include/harpocrates_parallel.hpp | 41 +++- include/harpocrates_utils.hpp | 388 +++++++++++++++++++++++++++++++ 3 files changed, 533 insertions(+), 2 deletions(-) diff --git a/include/harpocrates.hpp b/include/harpocrates.hpp index e53ca99..0b87fed 100644 --- a/include/harpocrates.hpp +++ b/include/harpocrates.hpp @@ -15,6 +15,57 @@ namespace harpocrates { // // Output: // - enc: 16 encrypted output bytes +#if defined HARPOCRATES_PARALLEL +static inline void +encrypt(const sycl::local_ptr lut, // look up table + const uint8_t* const __restrict txt, // input plain text + uint8_t* const __restrict enc // output encrypted bytes +) +{ + uint16_t state[8] = { 0u }; + + constexpr size_t itr_cnt = harpocrates_common::N_ROWS >> 1; + +#if defined __clang__ +#pragma unroll 4 +#elif defined __GNUG__ +#pragma GCC ivdep +#pragma GCC unroll 4 +#endif + for (size_t i = 0; i < itr_cnt; i++) { + const size_t b_off = i << 2; + const size_t r_off = i << 1; + + state[r_off ^ 0] = (static_cast(txt[b_off ^ 0]) << 8) | + static_cast(txt[b_off ^ 1]); + state[r_off ^ 1] = (static_cast(txt[b_off ^ 2]) << 8) | + static_cast(txt[b_off ^ 3]); + } + + for (size_t i = 0; i < harpocrates_common::N_ROUNDS; i++) { + harpocrates_utils::left_to_right_convoluted_substitution(state, lut); + harpocrates_utils::add_rc(state, i); + harpocrates_utils::column_substitution(state, lut); + harpocrates_utils::right_to_left_convoluted_substitution(state, lut); + } + +#if defined __clang__ +#pragma unroll 4 +#elif defined __GNUG__ +#pragma GCC ivdep +#pragma GCC unroll 4 +#endif + for (size_t i = 0; i < itr_cnt; i++) { + const size_t b_off = i << 2; + const size_t r_off = i << 1; + + enc[b_off ^ 0] = static_cast(state[r_off ^ 0] >> 8); + enc[b_off ^ 1] = static_cast(state[r_off ^ 0]); + enc[b_off ^ 2] = static_cast(state[r_off ^ 1] >> 8); + enc[b_off ^ 3] = static_cast(state[r_off ^ 1]); + } +} +#else static inline void encrypt(const uint8_t* const __restrict lut, // look up table const uint8_t* const __restrict txt, // input plain text @@ -64,6 +115,7 @@ encrypt(const uint8_t* const __restrict lut, // look up table enc[b_off ^ 3] = static_cast(state[r_off ^ 1]); } } +#endif // Given 16 -bytes of encrypted input message block & inverse look up table // ( read `inv_lut` ) of size 256 ( because each LUT element is of size 8 -bit @@ -82,6 +134,59 @@ encrypt(const uint8_t* const __restrict lut, // look up table // inv_lut = harpocrates_utils::generate_inv_lut(lut) // // where `lut` is the same look up table used during encryption. +#if defined HARPOCRATES_PARALLEL +static inline void +decrypt(const sycl::local_ptr inv_lut, // inverse look up table + const uint8_t* const __restrict enc, // input encrypted bytes + uint8_t* const __restrict dec // output decrypted bytes +) +{ + uint16_t state[8] = { 0u }; + + constexpr size_t itr_cnt = harpocrates_common::N_ROWS >> 1; + +#if defined __clang__ +#pragma unroll 4 +#elif defined __GNUG__ +#pragma GCC ivdep +#pragma GCC unroll 4 +#endif + for (size_t i = 0; i < itr_cnt; i++) { + const size_t b_off = i << 2; + const size_t r_off = i << 1; + + state[r_off ^ 0] = (static_cast(enc[b_off ^ 0]) << 8) | + static_cast(enc[b_off ^ 1]); + state[r_off ^ 1] = (static_cast(enc[b_off ^ 2]) << 8) | + static_cast(enc[b_off ^ 3]); + } + + for (size_t i = 0; i < harpocrates_common::N_ROUNDS; i++) { + using namespace harpocrates_utils; + + left_to_right_convoluted_substitution(state, inv_lut); + column_substitution(state, inv_lut); + add_rc(state, harpocrates_common::N_ROUNDS - (i + 1)); + right_to_left_convoluted_substitution(state, inv_lut); + } + +#if defined __clang__ +#pragma unroll 4 +#elif defined __GNUG__ +#pragma GCC ivdep +#pragma GCC unroll 4 +#endif + for (size_t i = 0; i < itr_cnt; i++) { + const size_t b_off = i << 2; + const size_t r_off = i << 1; + + dec[b_off ^ 0] = static_cast(state[r_off ^ 0] >> 8); + dec[b_off ^ 1] = static_cast(state[r_off ^ 0]); + dec[b_off ^ 2] = static_cast(state[r_off ^ 1] >> 8); + dec[b_off ^ 3] = static_cast(state[r_off ^ 1]); + } +} +#else static inline void decrypt(const uint8_t* const __restrict inv_lut, // inverse look up table const uint8_t* const __restrict enc, // input encrypted bytes @@ -133,5 +238,6 @@ decrypt(const uint8_t* const __restrict inv_lut, // inverse look up table dec[b_off ^ 3] = static_cast(state[r_off ^ 1]); } } +#endif } diff --git a/include/harpocrates_parallel.hpp b/include/harpocrates_parallel.hpp index 5ca8645..3eb2993 100644 --- a/include/harpocrates_parallel.hpp +++ b/include/harpocrates_parallel.hpp @@ -1,4 +1,9 @@ #pragma once + +#if !defined HARPOCRATES_PARALLEL +#define HARPOCRATES_PARALLEL +#endif + #include "harpocrates.hpp" #include #include @@ -70,12 +75,28 @@ encrypt(sycl::queue& q, // SYCL queue // create dependency graph h.depends_on(evts); + sycl::accessor + loc_lut{ sycl::range<1>{ 256ul }, h }; + const auto rng = sycl::nd_range<1>{ wi_cnt, wg_size }; h.parallel_for(rng, [=](sycl::nd_item<1> it) { + const auto grp = it.get_group(); + + if (it.get_local_linear_id() == 0ul) { + for (size_t i = 0; i < 256ul; i++) { + loc_lut[i] = lut[i]; + } + } + + sycl::group_barrier(grp, sycl::memory_scope_work_group); + const size_t idx = it.get_global_linear_id(); const size_t b_off = idx << 4; - harpocrates::encrypt(lut, txt + b_off, enc + b_off); + harpocrates::encrypt(loc_lut.get_pointer(), txt + b_off, enc + b_off); }); }); } @@ -120,12 +141,28 @@ decrypt( // create dependency graph h.depends_on(evts); + sycl::accessor + loc_inv_lut{ sycl::range<1>{ 256ul }, h }; + const auto rng = sycl::nd_range<1>{ wi_cnt, wg_size }; h.parallel_for(rng, [=](sycl::nd_item<1> it) { + const auto grp = it.get_group(); + + if (it.get_local_linear_id() == 0ul) { + for (size_t i = 0; i < 256ul; i++) { + loc_inv_lut[i] = inv_lut[i]; + } + } + + sycl::group_barrier(grp, sycl::memory_scope_work_group); + const size_t idx = it.get_global_linear_id(); const size_t b_off = idx << 4; - harpocrates::decrypt(inv_lut, enc + b_off, dec + b_off); + harpocrates::decrypt(loc_inv_lut.get_pointer(), enc + b_off, dec + b_off); }); }); } diff --git a/include/harpocrates_utils.hpp b/include/harpocrates_utils.hpp index 7db5dbb..519f2e5 100644 --- a/include/harpocrates_utils.hpp +++ b/include/harpocrates_utils.hpp @@ -4,6 +4,10 @@ #include #include +#if defined HARPOCRATES_PARALLEL +#include +#endif + // Harpocrates - An Efficient Encryption Mechanism for Data-at-rest, related // utility functions namespace harpocrates_utils { @@ -96,6 +100,56 @@ generate_inv_lut(const uint8_t* const __restrict lut, // // Also see figure 4 of above linked document to better understand workings of // this procedure +#if defined HARPOCRATES_PARALLEL +static inline void +left_to_right_convoluted_substitution(uint16_t* const __restrict state, + const sycl::local_ptr lut) +{ +#if defined __clang__ +#pragma unroll 8 +#elif defined __GNUG__ +#pragma GCC ivdep +#pragma GCC unroll 8 +#endif + for (size_t i = 0; i < harpocrates_common::N_ROWS; i++) { + const uint16_t row = state[i]; + + const uint8_t lo = static_cast(row); + + const uint8_t lo_msb0 = lo >> 6; + const uint8_t lo_msb2 = (lo >> 4) & 0b11; + const uint8_t lo_msb4 = (lo >> 2) & 0b11; + const uint8_t lo_msb6 = lo & 0b11; + + // step 1 + const uint8_t t0 = static_cast(row >> 8); + const uint8_t t1 = lut[t0]; + const uint8_t msb0 = t1 & 0b11000000; + + // step 2 + const uint8_t t2 = (t1 << 2) | lo_msb0; + const uint8_t t3 = lut[t2]; + const uint8_t msb2 = (t3 & 0b11000000) >> 2; + + // step 3 + const uint8_t t4 = (t3 << 2) | lo_msb2; + const uint8_t t5 = lut[t4]; + const uint8_t msb4 = (t5 & 0b11000000) >> 4; + + // step 4 + const uint8_t t6 = (t5 << 2) | lo_msb4; + const uint8_t t7 = lut[t6]; + const uint8_t msb6 = (t7 & 0b11000000) >> 6; + + // step 5 + const uint8_t t8 = (t7 << 2) | lo_msb6; + const uint8_t t9 = lut[t8]; + + const uint8_t hi = msb0 | msb2 | msb4 | msb6; + state[i] = (static_cast(hi) << 8) | static_cast(t9); + } +} +#else static inline void left_to_right_convoluted_substitution(uint16_t* const __restrict state, const uint8_t* const __restrict lut) @@ -144,6 +198,7 @@ left_to_right_convoluted_substitution(uint16_t* const __restrict state, state[i] = (static_cast(hi) << 8) | static_cast(t9); } } +#endif // Adds round constants into state matrix, to break the round's self-similarity // @@ -166,6 +221,287 @@ add_rc(uint16_t* const state, const size_t r_idx) // Column substitution for diffusing value of each row, taken from algorithm 3 // described in section 2.3 of Harpocrates specification // https://eprint.iacr.org/2022/519.pdf +#if defined HARPOCRATES_PARALLEL +static inline void +column_substitution(uint16_t* const __restrict state, + const sycl::local_ptr lut) +{ + const uint8_t row0_hi = static_cast(state[0] >> 8); + const uint8_t row1_hi = static_cast(state[1] >> 8); + const uint8_t row2_hi = static_cast(state[2] >> 8); + const uint8_t row3_hi = static_cast(state[3] >> 8); + const uint8_t row4_hi = static_cast(state[4] >> 8); + const uint8_t row5_hi = static_cast(state[5] >> 8); + const uint8_t row6_hi = static_cast(state[6] >> 8); + const uint8_t row7_hi = static_cast(state[7] >> 8); + + const uint8_t col0 = + ((row0_hi & 0b10000000) >> 0) | ((row1_hi & 0b10000000) >> 1) | + ((row2_hi & 0b10000000) >> 2) | ((row3_hi & 0b10000000) >> 3) | + ((row4_hi & 0b10000000) >> 4) | ((row5_hi & 0b10000000) >> 5) | + ((row6_hi & 0b10000000) >> 6) | ((row7_hi & 0b10000000) >> 7); + const uint8_t scol0 = lut[col0]; + + const uint8_t col1 = + ((row0_hi & 0b01000000) << 1) | ((row1_hi & 0b01000000) >> 0) | + ((row2_hi & 0b01000000) >> 1) | ((row3_hi & 0b01000000) >> 2) | + ((row4_hi & 0b01000000) >> 3) | ((row5_hi & 0b01000000) >> 4) | + ((row6_hi & 0b01000000) >> 5) | ((row7_hi & 0b01000000) >> 6); + const uint8_t scol1 = lut[col1]; + + const uint8_t col2 = + ((row0_hi & 0b00100000) << 2) | ((row1_hi & 0b00100000) << 1) | + ((row2_hi & 0b00100000) >> 0) | ((row3_hi & 0b00100000) >> 1) | + ((row4_hi & 0b00100000) >> 2) | ((row5_hi & 0b00100000) >> 3) | + ((row6_hi & 0b00100000) >> 4) | ((row7_hi & 0b00100000) >> 5); + const uint8_t scol2 = lut[col2]; + + const uint8_t col3 = + ((row0_hi & 0b00010000) << 3) | ((row1_hi & 0b00010000) << 2) | + ((row2_hi & 0b00010000) << 1) | ((row3_hi & 0b00010000) >> 0) | + ((row4_hi & 0b00010000) >> 1) | ((row5_hi & 0b00010000) >> 2) | + ((row6_hi & 0b00010000) >> 3) | ((row7_hi & 0b00010000) >> 4); + const uint8_t scol3 = lut[col3]; + + const uint8_t col4 = + ((row0_hi & 0b00001000) << 4) | ((row1_hi & 0b00001000) << 3) | + ((row2_hi & 0b00001000) << 2) | ((row3_hi & 0b00001000) << 1) | + ((row4_hi & 0b00001000) >> 0) | ((row5_hi & 0b00001000) >> 1) | + ((row6_hi & 0b00001000) >> 2) | ((row7_hi & 0b00001000) >> 3); + const uint8_t scol4 = lut[col4]; + + const uint8_t col5 = + ((row0_hi & 0b00000100) << 5) | ((row1_hi & 0b00000100) << 4) | + ((row2_hi & 0b00000100) << 3) | ((row3_hi & 0b00000100) << 2) | + ((row4_hi & 0b00000100) << 1) | ((row5_hi & 0b00000100) >> 0) | + ((row6_hi & 0b00000100) >> 1) | ((row7_hi & 0b00000100) >> 2); + const uint8_t scol5 = lut[col5]; + + const uint8_t col6 = + ((row0_hi & 0b00000010) << 6) | ((row1_hi & 0b00000010) << 5) | + ((row2_hi & 0b00000010) << 4) | ((row3_hi & 0b00000010) << 3) | + ((row4_hi & 0b00000010) << 2) | ((row5_hi & 0b00000010) << 1) | + ((row6_hi & 0b00000010) >> 0) | ((row7_hi & 0b00000010) >> 1); + const uint8_t scol6 = lut[col6]; + + const uint8_t col7 = + ((row0_hi & 0b00000001) << 7) | ((row1_hi & 0b00000001) << 6) | + ((row2_hi & 0b00000001) << 5) | ((row3_hi & 0b00000001) << 4) | + ((row4_hi & 0b00000001) << 3) | ((row5_hi & 0b00000001) << 2) | + ((row6_hi & 0b00000001) << 1) | ((row7_hi & 0b00000001) << 0); + const uint8_t scol7 = lut[col7]; + + const uint8_t row0_lo = static_cast(state[0]); + const uint8_t row1_lo = static_cast(state[1]); + const uint8_t row2_lo = static_cast(state[2]); + const uint8_t row3_lo = static_cast(state[3]); + const uint8_t row4_lo = static_cast(state[4]); + const uint8_t row5_lo = static_cast(state[5]); + const uint8_t row6_lo = static_cast(state[6]); + const uint8_t row7_lo = static_cast(state[7]); + + const uint8_t col8 = + ((row0_lo & 0b10000000) >> 0) | ((row1_lo & 0b10000000) >> 1) | + ((row2_lo & 0b10000000) >> 2) | ((row3_lo & 0b10000000) >> 3) | + ((row4_lo & 0b10000000) >> 4) | ((row5_lo & 0b10000000) >> 5) | + ((row6_lo & 0b10000000) >> 6) | ((row7_lo & 0b10000000) >> 7); + const uint8_t scol8 = lut[col8]; + + const uint8_t col9 = + ((row0_lo & 0b01000000) << 1) | ((row1_lo & 0b01000000) >> 0) | + ((row2_lo & 0b01000000) >> 1) | ((row3_lo & 0b01000000) >> 2) | + ((row4_lo & 0b01000000) >> 3) | ((row5_lo & 0b01000000) >> 4) | + ((row6_lo & 0b01000000) >> 5) | ((row7_lo & 0b01000000) >> 6); + const uint8_t scol9 = lut[col9]; + + const uint8_t col10 = + ((row0_lo & 0b00100000) << 2) | ((row1_lo & 0b00100000) << 1) | + ((row2_lo & 0b00100000) >> 0) | ((row3_lo & 0b00100000) >> 1) | + ((row4_lo & 0b00100000) >> 2) | ((row5_lo & 0b00100000) >> 3) | + ((row6_lo & 0b00100000) >> 4) | ((row7_lo & 0b00100000) >> 5); + const uint8_t scol10 = lut[col10]; + + const uint8_t col11 = + ((row0_lo & 0b00010000) << 3) | ((row1_lo & 0b00010000) << 2) | + ((row2_lo & 0b00010000) << 1) | ((row3_lo & 0b00010000) >> 0) | + ((row4_lo & 0b00010000) >> 1) | ((row5_lo & 0b00010000) >> 2) | + ((row6_lo & 0b00010000) >> 3) | ((row7_lo & 0b00010000) >> 4); + const uint8_t scol11 = lut[col11]; + + const uint8_t col12 = + ((row0_lo & 0b00001000) << 4) | ((row1_lo & 0b00001000) << 3) | + ((row2_lo & 0b00001000) << 2) | ((row3_lo & 0b00001000) << 1) | + ((row4_lo & 0b00001000) >> 0) | ((row5_lo & 0b00001000) >> 1) | + ((row6_lo & 0b00001000) >> 2) | ((row7_lo & 0b00001000) >> 3); + const uint8_t scol12 = lut[col12]; + + const uint8_t col13 = + ((row0_lo & 0b00000100) << 5) | ((row1_lo & 0b00000100) << 4) | + ((row2_lo & 0b00000100) << 3) | ((row3_lo & 0b00000100) << 2) | + ((row4_lo & 0b00000100) << 1) | ((row5_lo & 0b00000100) >> 0) | + ((row6_lo & 0b00000100) >> 1) | ((row7_lo & 0b00000100) >> 2); + const uint8_t scol13 = lut[col13]; + + const uint8_t col14 = + ((row0_lo & 0b00000010) << 6) | ((row1_lo & 0b00000010) << 5) | + ((row2_lo & 0b00000010) << 4) | ((row3_lo & 0b00000010) << 3) | + ((row4_lo & 0b00000010) << 2) | ((row5_lo & 0b00000010) << 1) | + ((row6_lo & 0b00000010) >> 0) | ((row7_lo & 0b00000010) >> 1); + const uint8_t scol14 = lut[col14]; + + const uint8_t col15 = + ((row0_lo & 0b00000001) << 7) | ((row1_lo & 0b00000001) << 6) | + ((row2_lo & 0b00000001) << 5) | ((row3_lo & 0b00000001) << 4) | + ((row4_lo & 0b00000001) << 3) | ((row5_lo & 0b00000001) << 2) | + ((row6_lo & 0b00000001) << 1) | ((row7_lo & 0b00000001) << 0); + const uint8_t scol15 = lut[col15]; + + const uint16_t row0 = (static_cast(scol0 & 0b10000000) << 8) | + (static_cast(scol1 & 0b10000000) << 7) | + (static_cast(scol2 & 0b10000000) << 6) | + (static_cast(scol3 & 0b10000000) << 5) | + (static_cast(scol4 & 0b10000000) << 4) | + (static_cast(scol5 & 0b10000000) << 3) | + (static_cast(scol6 & 0b10000000) << 2) | + (static_cast(scol7 & 0b10000000) << 1) | + (static_cast(scol8 & 0b10000000) >> 0) | + (static_cast(scol9 & 0b10000000) >> 1) | + (static_cast(scol10 & 0b10000000) >> 2) | + (static_cast(scol11 & 0b10000000) >> 3) | + (static_cast(scol12 & 0b10000000) >> 4) | + (static_cast(scol13 & 0b10000000) >> 5) | + (static_cast(scol14 & 0b10000000) >> 6) | + (static_cast(scol15 & 0b10000000) >> 7); + + const uint16_t row1 = (static_cast(scol0 & 0b01000000) << 9) | + (static_cast(scol1 & 0b01000000) << 8) | + (static_cast(scol2 & 0b01000000) << 7) | + (static_cast(scol3 & 0b01000000) << 6) | + (static_cast(scol4 & 0b01000000) << 5) | + (static_cast(scol5 & 0b01000000) << 4) | + (static_cast(scol6 & 0b01000000) << 3) | + (static_cast(scol7 & 0b01000000) << 2) | + (static_cast(scol8 & 0b01000000) << 1) | + (static_cast(scol9 & 0b01000000) >> 0) | + (static_cast(scol10 & 0b01000000) >> 1) | + (static_cast(scol11 & 0b01000000) >> 2) | + (static_cast(scol12 & 0b01000000) >> 3) | + (static_cast(scol13 & 0b01000000) >> 4) | + (static_cast(scol14 & 0b01000000) >> 5) | + (static_cast(scol15 & 0b01000000) >> 6); + + const uint16_t row2 = (static_cast(scol0 & 0b00100000) << 10) | + (static_cast(scol1 & 0b00100000) << 9) | + (static_cast(scol2 & 0b00100000) << 8) | + (static_cast(scol3 & 0b00100000) << 7) | + (static_cast(scol4 & 0b00100000) << 6) | + (static_cast(scol5 & 0b00100000) << 5) | + (static_cast(scol6 & 0b00100000) << 4) | + (static_cast(scol7 & 0b00100000) << 3) | + (static_cast(scol8 & 0b00100000) << 2) | + (static_cast(scol9 & 0b00100000) << 1) | + (static_cast(scol10 & 0b00100000) >> 0) | + (static_cast(scol11 & 0b00100000) >> 1) | + (static_cast(scol12 & 0b00100000) >> 2) | + (static_cast(scol13 & 0b00100000) >> 3) | + (static_cast(scol14 & 0b00100000) >> 4) | + (static_cast(scol15 & 0b00100000) >> 5); + + const uint16_t row3 = (static_cast(scol0 & 0b00010000) << 11) | + (static_cast(scol1 & 0b00010000) << 10) | + (static_cast(scol2 & 0b00010000) << 9) | + (static_cast(scol3 & 0b00010000) << 8) | + (static_cast(scol4 & 0b00010000) << 7) | + (static_cast(scol5 & 0b00010000) << 6) | + (static_cast(scol6 & 0b00010000) << 5) | + (static_cast(scol7 & 0b00010000) << 4) | + (static_cast(scol8 & 0b00010000) << 3) | + (static_cast(scol9 & 0b00010000) << 2) | + (static_cast(scol10 & 0b00010000) << 1) | + (static_cast(scol11 & 0b00010000) >> 0) | + (static_cast(scol12 & 0b00010000) >> 1) | + (static_cast(scol13 & 0b00010000) >> 2) | + (static_cast(scol14 & 0b00010000) >> 3) | + (static_cast(scol15 & 0b00010000) >> 4); + + const uint16_t row4 = (static_cast(scol0 & 0b00001000) << 12) | + (static_cast(scol1 & 0b00001000) << 11) | + (static_cast(scol2 & 0b00001000) << 10) | + (static_cast(scol3 & 0b00001000) << 9) | + (static_cast(scol4 & 0b00001000) << 8) | + (static_cast(scol5 & 0b00001000) << 7) | + (static_cast(scol6 & 0b00001000) << 6) | + (static_cast(scol7 & 0b00001000) << 5) | + (static_cast(scol8 & 0b00001000) << 4) | + (static_cast(scol9 & 0b00001000) << 3) | + (static_cast(scol10 & 0b00001000) << 2) | + (static_cast(scol11 & 0b00001000) << 1) | + (static_cast(scol12 & 0b00001000) >> 0) | + (static_cast(scol13 & 0b00001000) >> 1) | + (static_cast(scol14 & 0b00001000) >> 2) | + (static_cast(scol15 & 0b00001000) >> 3); + + const uint16_t row5 = (static_cast(scol0 & 0b00000100) << 13) | + (static_cast(scol1 & 0b00000100) << 12) | + (static_cast(scol2 & 0b00000100) << 11) | + (static_cast(scol3 & 0b00000100) << 10) | + (static_cast(scol4 & 0b00000100) << 9) | + (static_cast(scol5 & 0b00000100) << 8) | + (static_cast(scol6 & 0b00000100) << 7) | + (static_cast(scol7 & 0b00000100) << 6) | + (static_cast(scol8 & 0b00000100) << 5) | + (static_cast(scol9 & 0b00000100) << 4) | + (static_cast(scol10 & 0b00000100) << 3) | + (static_cast(scol11 & 0b00000100) << 2) | + (static_cast(scol12 & 0b00000100) << 1) | + (static_cast(scol13 & 0b00000100) >> 0) | + (static_cast(scol14 & 0b00000100) >> 1) | + (static_cast(scol15 & 0b00000100) >> 2); + + const uint16_t row6 = (static_cast(scol0 & 0b00000010) << 14) | + (static_cast(scol1 & 0b00000010) << 13) | + (static_cast(scol2 & 0b00000010) << 12) | + (static_cast(scol3 & 0b00000010) << 11) | + (static_cast(scol4 & 0b00000010) << 10) | + (static_cast(scol5 & 0b00000010) << 9) | + (static_cast(scol6 & 0b00000010) << 8) | + (static_cast(scol7 & 0b00000010) << 7) | + (static_cast(scol8 & 0b00000010) << 6) | + (static_cast(scol9 & 0b00000010) << 5) | + (static_cast(scol10 & 0b00000010) << 4) | + (static_cast(scol11 & 0b00000010) << 3) | + (static_cast(scol12 & 0b00000010) << 2) | + (static_cast(scol13 & 0b00000010) << 1) | + (static_cast(scol14 & 0b00000010) >> 0) | + (static_cast(scol15 & 0b00000010) >> 1); + + const uint16_t row7 = (static_cast(scol0 & 0b00000001) << 15) | + (static_cast(scol1 & 0b00000001) << 14) | + (static_cast(scol2 & 0b00000001) << 13) | + (static_cast(scol3 & 0b00000001) << 12) | + (static_cast(scol4 & 0b00000001) << 11) | + (static_cast(scol5 & 0b00000001) << 10) | + (static_cast(scol6 & 0b00000001) << 9) | + (static_cast(scol7 & 0b00000001) << 8) | + (static_cast(scol8 & 0b00000001) << 7) | + (static_cast(scol9 & 0b00000001) << 6) | + (static_cast(scol10 & 0b00000001) << 5) | + (static_cast(scol11 & 0b00000001) << 4) | + (static_cast(scol12 & 0b00000001) << 3) | + (static_cast(scol13 & 0b00000001) << 2) | + (static_cast(scol14 & 0b00000001) << 1) | + (static_cast(scol15 & 0b00000001) >> 0); + + state[0] = row0; + state[1] = row1; + state[2] = row2; + state[3] = row3; + state[4] = row4; + state[5] = row5; + state[6] = row6; + state[7] = row7; +} +#else static inline void column_substitution(uint16_t* const __restrict state, const uint8_t* const __restrict lut) @@ -445,12 +781,63 @@ column_substitution(uint16_t* const __restrict state, state[6] = row6; state[7] = row7; } +#endif // Right to left convoluted substitution, as described in point (4) of // section 2.3 of Harpocrates specification https://eprint.iacr.org/2022/519.pdf // // Also see figure 7 of above linked document to better understand workings of // this procedure +#if defined HARPOCRATES_PARALLEL +static inline void +right_to_left_convoluted_substitution(uint16_t* const __restrict state, + const sycl::local_ptr lut) +{ +#if defined __clang__ +#pragma unroll 8 +#elif defined __GNUG__ +#pragma GCC ivdep +#pragma GCC unroll 8 +#endif + for (size_t i = 0; i < harpocrates_common::N_ROWS; i++) { + const uint16_t row = state[i]; + + const uint8_t hi = static_cast(row >> 8); + + const uint8_t hi_msb6 = hi << 6; + const uint8_t hi_msb4 = (hi << 4) & 0b11000000; + const uint8_t hi_msb2 = (hi << 2) & 0b11000000; + const uint8_t hi_msb0 = hi & 0b11000000; + + // step 1 + const uint8_t t0 = static_cast(row); + const uint8_t t1 = lut[t0]; + const uint8_t msb6 = t1 & 0b11; + + // step 2 + const uint8_t t2 = hi_msb6 | (t1 >> 2); + const uint8_t t3 = lut[t2]; + const uint8_t msb4 = (t3 & 0b11) << 2; + + // step 3 + const uint8_t t4 = hi_msb4 | (t3 >> 2); + const uint8_t t5 = lut[t4]; + const uint8_t msb2 = (t5 & 0b11) << 4; + + // step 4 + const uint8_t t6 = hi_msb2 | (t5 >> 2); + const uint8_t t7 = lut[t6]; + const uint8_t msb0 = (t7 & 0b11) << 6; + + // step 5 + const uint8_t t8 = hi_msb0 | (t7 >> 2); + const uint8_t t9 = lut[t8]; + + const uint8_t lo = msb0 | msb2 | msb4 | msb6; + state[i] = (static_cast(t9) << 8) | static_cast(lo); + } +} +#else static inline void right_to_left_convoluted_substitution(uint16_t* const __restrict state, const uint8_t* const __restrict lut) @@ -499,5 +886,6 @@ right_to_left_convoluted_substitution(uint16_t* const __restrict state, state[i] = (static_cast(t9) << 8) | static_cast(lo); } } +#endif } From 285a0e308340bffe1b0a9ec8fdff3e76fa739044 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Sat, 14 May 2022 16:27:25 +0530 Subject: [PATCH 2/3] data-parallel harpocrates benchmark results on multi-core CPU & Intel Iris Max GPU Code checked out at commit https://github.com/itzmeanjan/harpocrates/commit/2271291588ffb782537e085c5a675429c564924d Signed-off-by: Anjan Roy --- results/cpu/intel.md | 52 ++++++++++++++++++++++++++++++++++++++++++++ results/gpu/intel.md | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/results/cpu/intel.md b/results/cpu/intel.md index 0078c17..11c6b31 100644 --- a/results/cpu/intel.md +++ b/results/cpu/intel.md @@ -1,3 +1,55 @@ +# Benchmarking Harpocrates Cipher on Intel(R) Core(TM) i9-10920X CPU @ 3.50GHz + +Build & offload computation + +```bash +make aot_cpu +``` + +```bash +$ lscpu | grep -i cpu\(s\) # number -of CPUs to offload computation to + +CPU(s): 24 +On-line CPU(s) list: 0-23 +NUMA node0 CPU(s): 0-23 +``` + +```bash +Running on Intel(R) Core(TM) i9-10920X CPU @ 3.50GHz + ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 4.229436 GB/ s|298.232844 MB/ s| 2.879622 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 4.247370 GB/ s|344.488911 MB/ s| 9.666142 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 4.771096 GB/ s|327.086760 MB/ s| 1.556701 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 4.771116 GB/ s|340.449095 MB/ s| 9.326785 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 4.113762 GB/ s|343.731258 MB/ s| 9.102554 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 4.114482 GB/ s|338.225437 MB/ s| 9.274672 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 4.526601 GB/ s|350.803636 MB/ s| 7.235439 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 4.527484 GB/ s|345.359675 MB/ s| 9.368638 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 5.017438 GB/ s|350.834845 MB/ s| 833.204813 MB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 5.017160 GB/ s|345.410328 MB/ s| 8.591400 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 5.432594 GB/ s|351.295495 MB/ s| 5.354879 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 5.432732 GB/ s|345.625445 MB/ s| 7.118138 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 4.999231 GB/ s|351.177205 MB/ s| 6.406842 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 4.999308 GB/ s|344.667218 MB/ s| 9.617469 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +``` + # Benchmarking Harpocrates Cipher on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz Build & offload computation diff --git a/results/gpu/intel.md b/results/gpu/intel.md index ee1f374..d76ea0a 100644 --- a/results/gpu/intel.md +++ b/results/gpu/intel.md @@ -1,3 +1,47 @@ +# Benchmarking Harpocrates Cipher on Intel(R) Iris(R) Xe MAX Graphics [0x4905] + +Build & offload computation + +```bash +make aot_gpu # keep device identifier argument `0x4905` +``` + +```bash +Running on Intel(R) Iris(R) Xe MAX Graphics [0x4905] + ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 3.484858 GB/ s|132.037404 MB/ s| 5.292577 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 3.486233 GB/ s|132.078098 MB/ s| 5.292577 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 3.507018 GB/ s|132.108437 MB/ s| 5.293696 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 3.507100 GB/ s|132.156384 MB/ s| 5.293136 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 3.508930 GB/ s|132.145855 MB/ s| 5.294092 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 3.508981 GB/ s|132.193602 MB/ s| 5.293346 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 3.510322 GB/ s|132.165268 MB/ s| 5.294349 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 3.510342 GB/ s|132.211182 MB/ s| 5.293568 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 3.510495 GB/ s|132.174153 MB/ s| 5.294460 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 3.510498 GB/ s|132.220301 MB/ s| 5.293906 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 3.510778 GB/ s|132.179018 MB/ s| 5.294533 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 3.510781 GB/ s|132.223493 MB/ s| 5.293830 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 3.511279 GB/ s|132.181247 MB/ s| 5.294560 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 3.511280 GB/ s|132.223371 MB/ s| 5.294034 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ +``` + # Benchmarking Harpocrates Cipher on Intel(R) UHD Graphics P630 Build & offload computation From 2d77f14b484af011ee40fd8b2fec72aadc579ce3 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Sat, 14 May 2022 17:06:47 +0530 Subject: [PATCH 3/3] performance improvements seen on GPU systems, due to changes made in commit https://github.com/itzmeanjan/harpocrates/commit/32c9f06517428ce073fb6d112fec65f600ec52f8 Signed-off-by: Anjan Roy --- results/gpu/intel.md | 90 +++++++++++++++++++++---------------------- results/gpu/nvidia.md | 28 +++++++------- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/results/gpu/intel.md b/results/gpu/intel.md index d76ea0a..2ba619e 100644 --- a/results/gpu/intel.md +++ b/results/gpu/intel.md @@ -9,37 +9,37 @@ make aot_gpu # keep device identifier argument `0x4905` ```bash Running on Intel(R) Iris(R) Xe MAX Graphics [0x4905] -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 3.484858 GB/ s|132.037404 MB/ s| 5.292577 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 3.486233 GB/ s|132.078098 MB/ s| 5.292577 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 3.507018 GB/ s|132.108437 MB/ s| 5.293696 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 3.507100 GB/ s|132.156384 MB/ s| 5.293136 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 3.508930 GB/ s|132.145855 MB/ s| 5.294092 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 3.508981 GB/ s|132.193602 MB/ s| 5.293346 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 3.510322 GB/ s|132.165268 MB/ s| 5.294349 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 3.510342 GB/ s|132.211182 MB/ s| 5.293568 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 3.510495 GB/ s|132.174153 MB/ s| 5.294460 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 3.510498 GB/ s|132.220301 MB/ s| 5.293906 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 3.510778 GB/ s|132.179018 MB/ s| 5.294533 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 3.510781 GB/ s|132.223493 MB/ s| 5.293830 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 3.511279 GB/ s|132.181247 MB/ s| 5.294560 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 3.511280 GB/ s|132.223371 MB/ s| 5.294034 GB/ s| -+----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 3.479370 GB/ s|2.311853 GB/ s| 5.265403 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 3.480740 GB/ s|2.331803 GB/ s| 5.277055 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 3.506261 GB/ s|2.342400 GB/ s| 5.293370 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 3.506281 GB/ s|2.367388 GB/ s| 5.279373 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 3.508120 GB/ s|2.359840 GB/ s| 5.293952 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 3.508294 GB/ s|2.384711 GB/ s| 5.293999 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 3.510086 GB/ s|2.367883 GB/ s| 5.294291 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 3.510065 GB/ s|2.393030 GB/ s| 5.293906 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 3.510623 GB/ s|2.372023 GB/ s| 5.294232 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 3.510616 GB/ s|2.397787 GB/ s| 5.293859 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 3.511095 GB/ s|2.374095 GB/ s| 5.294524 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 3.511104 GB/ s|2.400005 GB/ s| 5.293920 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 3.511192 GB/ s|2.375115 GB/ s| 5.294550 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 3.511192 GB/ s|2.401144 GB/ s| 5.294028 GB/ s| ++----------------+-------------------+--------------------+---------------------+------------------+--------------+------------------+ ``` # Benchmarking Harpocrates Cipher on Intel(R) UHD Graphics P630 @@ -56,32 +56,32 @@ Running on Intel(R) UHD Graphics P630 [0x3e96] +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ |# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 8.597729 GB/ s|267.765881 MB/ s| 16.256737 GB/ s| +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 12.277024 GB/ s|321.086067 MB/ s| 16.426964 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 8.598515 GB/ s|268.110738 MB/ s| 16.415505 GB/ s| +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 12.294663 GB/ s|279.986237 MB/ s| 15.798339 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 8.695757 GB/ s|268.338397 MB/ s| 15.930694 GB/ s| +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 14.392542 GB/ s|320.914484 MB/ s| 16.334318 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 8.696962 GB/ s|268.594882 MB/ s| 16.897317 GB/ s| +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 14.454426 GB/ s|283.905231 MB/ s| 16.736576 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 8.679487 GB/ s|269.241631 MB/ s| 16.855711 GB/ s| +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 15.382092 GB/ s|321.055051 MB/ s| 16.971198 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 8.688099 GB/ s|269.201400 MB/ s| 16.876867 GB/ s| +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 15.409793 GB/ s|284.055836 MB/ s| 16.754451 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 8.694904 GB/ s|269.614622 MB/ s| 16.771990 GB/ s| +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 16.232237 GB/ s|321.280723 MB/ s| 16.786572 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 8.694703 GB/ s|269.745347 MB/ s| 16.876489 GB/ s| +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 16.249050 GB/ s|284.501681 MB/ s| 16.713174 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 8.801711 GB/ s|269.672235 MB/ s| 17.005500 GB/ s| +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 16.529387 GB/ s|320.661454 MB/ s| 16.873652 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 8.803615 GB/ s|269.444002 MB/ s| 16.860051 GB/ s| +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 16.531928 GB/ s|284.080429 MB/ s| 16.604272 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 9.204012 GB/ s|269.588275 MB/ s| 16.980957 GB/ s| +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 16.744679 GB/ s|321.133104 MB/ s| 16.881786 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 9.205249 GB/ s|269.092258 MB/ s| 16.951713 GB/ s| +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 16.747100 GB/ s|284.403541 MB/ s| 16.717070 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 10.123275 GB/ s|269.862527 MB/ s| 17.038062 GB/ s| +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 16.919484 GB/ s|321.205007 MB/ s| 16.999118 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ -|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 10.123922 GB/ s|269.396075 MB/ s| 15.748901 GB/ s| +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 16.919770 GB/ s|284.318009 MB/ s| 16.885997 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+----------------+------------------+ ``` diff --git a/results/gpu/nvidia.md b/results/gpu/nvidia.md index a41158e..ba92d40 100644 --- a/results/gpu/nvidia.md +++ b/results/gpu/nvidia.md @@ -12,32 +12,32 @@ Running on Tesla V100-SXM2-16GB +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ |# -of work-items| kernel name|input size ( bytes )|output size ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 9.426423 GB/ s|10.351960 GB/ s| 11.581023 GB/ s| +|1048576 |Harpocrates Encrypt| 16.000244 MB| 16.000000 MB| 9.448642 GB/ s|11.361678 GB/ s| 11.570886 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 9.426810 GB/ s|10.494332 GB/ s| 11.638742 GB/ s| +|1048576 |Harpocrates Decrypt| 16.000244 MB| 16.000000 MB| 9.442242 GB/ s|11.395635 GB/ s| 11.633786 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 10.215561 GB/ s|10.458480 GB/ s| 11.648543 GB/ s| +|2097152 |Harpocrates Encrypt| 32.000244 MB| 32.000000 MB| 10.244583 GB/ s|11.511693 GB/ s| 11.623949 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 10.226479 GB/ s|10.618554 GB/ s| 11.646025 GB/ s| +|2097152 |Harpocrates Decrypt| 32.000244 MB| 32.000000 MB| 10.256899 GB/ s|11.577161 GB/ s| 11.660348 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 10.291292 GB/ s|10.510542 GB/ s| 11.664863 GB/ s| +|4194304 |Harpocrates Encrypt| 64.000244 MB| 64.000000 MB| 10.287567 GB/ s|11.572844 GB/ s| 11.653049 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 10.296363 GB/ s|10.664890 GB/ s| 11.632136 GB/ s| +|4194304 |Harpocrates Decrypt| 64.000244 MB| 64.000000 MB| 10.293358 GB/ s|11.643508 GB/ s| 11.672573 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 10.312413 GB/ s|10.537798 GB/ s| 11.670979 GB/ s| +|8388608 |Harpocrates Encrypt| 128.000244 MB| 128.000000 MB| 10.315321 GB/ s|11.604715 GB/ s| 11.662604 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 10.314698 GB/ s|10.697646 GB/ s| 11.668983 GB/ s| +|8388608 |Harpocrates Decrypt| 128.000244 MB| 128.000000 MB| 10.319584 GB/ s|11.679098 GB/ s| 11.649071 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 10.328942 GB/ s|10.550175 GB/ s| 11.677900 GB/ s| +|16777216 |Harpocrates Encrypt| 256.000244 MB| 256.000000 MB| 10.328733 GB/ s|11.619067 GB/ s| 11.671245 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 10.334467 GB/ s|10.708384 GB/ s| 11.677101 GB/ s| +|16777216 |Harpocrates Decrypt| 256.000244 MB| 256.000000 MB| 10.329880 GB/ s|11.692568 GB/ s| 11.679765 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 10.326854 GB/ s|11.640994 GB/ s| 11.680031 GB/ s| +|33554432 |Harpocrates Encrypt| 512.000244 MB| 512.000000 MB| 10.339888 GB/ s|12.017792 GB/ s| 11.676968 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 10.329666 GB/ s|11.816157 GB/ s| 11.683763 GB/ s| +|33554432 |Harpocrates Decrypt| 512.000244 MB| 512.000000 MB| 10.340515 GB/ s|12.846085 GB/ s| 11.680031 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 10.332478 GB/ s|12.360134 GB/ s| 11.648277 GB/ s| +|67108864 |Harpocrates Encrypt| 1.000000 GB| 1.000000 GB| 10.334355 GB/ s|13.622818 GB/ s| 11.674571 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ -|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 10.333625 GB/ s|12.549789 GB/ s| 11.680031 GB/ s| +|67108864 |Harpocrates Decrypt| 1.000000 GB| 1.000000 GB| 10.334564 GB/ s|13.708900 GB/ s| 11.668186 GB/ s| +----------------+-------------------+--------------------+---------------------+------------------+---------------+------------------+ ```