refs #6, #8: Use ShiftedInt<uint16_t, 2> instead of uint32_t for offsets

* This restores former performance improvements! (Now ~3.4 Gbps per node with IPsec@64B, previously ~2.6 Gbps)
ANLAB-KAIST · Jan 22, 2016 · f041e78 · f041e78
1 parent 7f99ef1
commit f041e78
Show file tree

Hide file tree

Showing 8 changed files with 30 additions and 17 deletions.
diff --git a/Snakefile b/Snakefile
@@ -109,7 +109,7 @@ if v: CFLAGS += ' -DNBA_RANDOM_PORT_ACCESS'
 # NVIDIA CUDA configurations
 if USE_CUDA:
     CUDA_ARCHS    = compilelib.get_cuda_arch()
-    NVCFLAGS      = '-O2 -g -std=c++11 --use_fast_math -Iinclude -I/usr/local/cuda/include'
+    NVCFLAGS      = '-O2 -g -std=c++11 --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include'
     CFLAGS       += ' -I/usr/local/cuda/include'
     LIBS         += ' -L/usr/local/cuda/lib64 -lcudart' #' -lnvidia-ml'
     print(CUDA_ARCHS)

diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu
@@ -712,7 +712,7 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
                                                   [item_idx];
     const int pkt_idx         = cur_block_info.pkt_idx;
     const int block_idx_local = cur_block_info.block_idx;
-    const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx];
+    const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value<uintptr_t>();
     const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
 
     if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
@@ -792,4 +792,4 @@ void *nba::ipsec_aes_encryption_get_cuda_kernel() {
     return reinterpret_cast<void *> (AES_ctr_encrypt_chunk_SharedMem_5);
 }
 
-// vim: ts=8 sts=4 sw=4 et
+// vim: ts=8 sts=4 sw=4 et tw=150
diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1252,7 +1252,7 @@ __global__ void computeHMAC_SHA1_3(
         const struct datablock_kernel_arg *db_flow_ids     = datablocks[dbid_flow_ids_d];
 
         const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx];
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
         const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
         if (enc_payload_base != NULL && offset != 0 && length != 0) {
             const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
@@ -1277,4 +1277,4 @@ void *nba::ipsec_hsha1_encryption_get_cuda_kernel() {
     return reinterpret_cast<void *> (computeHMAC_SHA1_3);
 }
 
-// vim: ts=8 sts=4 sw=4 et
+// vim: ts=8 sts=4 sw=4 et tw=150
diff --git a/include/nba/core/shiftedint.hh b/include/nba/core/shiftedint.hh
@@ -7,6 +7,15 @@
 #include <cassert>
 #include <exception>
 
+#ifndef __CUDACC__
+#ifndef __host__
+#define __host__
+#endif
+#ifndef __device__
+#define __device__
+#endif
+#endif
+
 namespace nba {
 
 #ifdef DEBUG
@@ -28,7 +37,7 @@ template<typename Int, unsigned Shift>
 class ShiftedInt
 {
 private:
-    typedef __uint128_t LARGE_INT;
+    typedef uint64_t LARGE_INT;
     static_assert(std::is_integral<Int>::value, "Integer type required.");
 
     Int shifted_value;
@@ -177,7 +186,7 @@ private:
     }
 
     template<typename ReturnInt>
-    ReturnInt as_value() {
+    __host__ __device__ inline ReturnInt as_value() const {
         static_assert(std::numeric_limits<ReturnInt>::max()
                 >= ((LARGE_INT)std::numeric_limits<Int>::max() << Shift),
                 "return type is not large enough.");

diff --git a/include/nba/engines/cuda/compat.hh b/include/nba/engines/cuda/compat.hh
@@ -7,6 +7,8 @@
  * Note that the nvcc should support C++11 (CUDA v6.5 or higher).
  */
 
+#include <cstdint>
+#include <nba/core/shiftedint.hh>
 #include <nba/framework/config.hh>
 
 struct datablock_batch_info {
@@ -16,8 +18,8 @@ struct datablock_batch_info {
     uint32_t item_count_out;
     uint16_t *item_sizes_in;
     uint16_t *item_sizes_out;
-    uint32_t *item_offsets_in;
-    uint32_t *item_offsets_out;
+    nba::ShiftedInt<uint16_t, 2> *item_offsets_in;
+    nba::ShiftedInt<uint16_t, 2> *item_offsets_out;
 }; // __cuda_aligned
 
 struct datablock_kernel_arg {

diff --git a/include/nba/framework/datablock.hh b/include/nba/framework/datablock.hh
@@ -3,6 +3,7 @@
 
 #include <nba/framework/config.hh>
 #include <nba/framework/computecontext.hh>
+#include <nba/core/shiftedint.hh> // should come after cuda headers
 #include <vector>
 #include <string>
 #include <tuple>
@@ -79,15 +80,15 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
     };
-    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
+    ShiftedInt<uint16_t, 2>[NBA_MAX_COMP_BATCH_SIZE * 12];
 };
 #else
 struct item_size_info {
     union {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
     };
-    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
+    ShiftedInt<uint16_t, 2> offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
 };
 #endif
 
@@ -125,8 +126,8 @@ struct datablock_batch_info {
     uint32_t item_count_out;
     uint16_t *item_sizes_in;
     uint16_t *item_sizes_out;
-    uint32_t *item_offsets_in;
-    uint32_t *item_offsets_out;
+    ShiftedInt<uint16_t, 2> *item_offsets_in;
+    ShiftedInt<uint16_t, 2> *item_offsets_out;
 }; // __cuda_aligned
 
 /**

diff --git a/src/lib/datablock.cc b/src/lib/datablock.cc
@@ -202,7 +202,7 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) {
             if (IS_PACKET_INVALID(batch, pkt_idx))
                 continue;
             size_t aligned_elemsz = t->aligned_item_sizes_h->sizes[pkt_idx];
-            size_t offset         = t->aligned_item_sizes_h->offsets[pkt_idx];
+            size_t offset         = t->aligned_item_sizes_h->offsets[pkt_idx].as_value<size_t>();
             rte_memcpy((char*) host_in_buffer + offset,
                        rte_pktmbuf_mtod(batch->packets[pkt_idx], char*) + read_roi.offset,
                        aligned_elemsz);
@@ -250,7 +250,7 @@ void DataBlock::postprocess(OffloadableElement *elem, int input_port, PacketBatc
                                               t->aligned_item_sizes_h->sizes[pkt_idx]);
             size_t offset = bitselect<size_t>(write_roi.type == WRITE_PARTIAL_PACKET,
                                               t->aligned_item_sizes_h->size * pkt_idx,
-                                              t->aligned_item_sizes_h->offsets[pkt_idx]);
+                                              t->aligned_item_sizes_h->offsets[pkt_idx].as_value<size_t>());
             rte_memcpy(rte_pktmbuf_mtod(batch->packets[pkt_idx], char*) + write_roi.offset,
                        (char*) host_out_ptr + offset,
                        elemsz);

diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
@@ -10,6 +10,7 @@
 #include <nba/engines/cuda/computedevice.hh>
 #include <nba/engines/cuda/computecontext.hh>
 #endif
+#include <nba/core/shiftedint.hh> // should come after cuda headers
 #include <tuple>
 #include <ev.h>
 #include <rte_memcpy.h>
@@ -234,10 +235,10 @@ bool OffloadTask::copy_h2d()
                 dbarg_h->batches[b].item_sizes_out = (uint16_t *)
                         ((char *) t->aligned_item_sizes_d.ptr
                          + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarg_h->batches[b].item_offsets_in = (uint32_t *)
+                dbarg_h->batches[b].item_offsets_in = (ShiftedInt<uint16_t, 2> *)
                         ((char *) t->aligned_item_sizes_d.ptr
                          + (uintptr_t) offsetof(struct item_size_info, offsets));
-                dbarg_h->batches[b].item_offsets_out = (uint32_t *)
+                dbarg_h->batches[b].item_offsets_out = (ShiftedInt<uint16_t, 2> *)
                         ((char *) t->aligned_item_sizes_d.ptr
                          + (uintptr_t) offsetof(struct item_size_info, offsets));
             } else {