refs #6: Reduce memory footprints for batch/item ID arrays

* Replaces them with the new accumulated index calculator.
ANLAB-KAIST · Mar 5, 2016 · 82d0bac · 82d0bac
1 parent 1d78da1
commit 82d0bac
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 94 deletions.
diff --git a/elements/ip/IPlookup_kernel.cu b/elements/ip/IPlookup_kernel.cu
@@ -10,6 +10,8 @@
 
 // includes, project
 #include <cuda.h>
+#include <nba/core/errors.hh>
+#include <nba/core/accumidx.hh>
 #include <nba/engines/cuda/utils.hh>
 #include "IPlookup_kernel.hh"
 
@@ -23,7 +25,7 @@ extern "C" {
 #define dbid_ipv4_dest_addrs_d     (0)
 #define dbid_ipv4_lookup_results_d (1)
 
-__device__ static uint32_t ntohl(uint32_t n)
+__device__ static inline uint32_t ntohl(uint32_t n)
 {
     return ((n & 0xff000000) >> 24) | ((n & 0x00ff0000) >> 8) | \
            ((n & 0x0000ff00) << 8)  | ((n & 0x000000ff) << 24);
@@ -32,16 +34,17 @@ __device__ static uint32_t ntohl(uint32_t n)
 /* The GPU kernel. */
 __global__ void ipv4_route_lookup_cuda(
         struct datablock_kernel_arg **datablocks,
-        uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
+        uint32_t count, uint32_t *item_counts, uint32_t num_batches,
         uint8_t *checkbits_d,
         uint16_t* __restrict__ TBL24_d,
         uint16_t* __restrict__ TBLlong_d)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < count) {
-        uint8_t batch_idx = batch_ids[idx];
-        uint16_t item_idx = item_ids[idx];
+        uint32_t batch_idx, item_idx;
+        assert(nba::NBA_SUCCESS == nba::get_accum_idx(item_counts, num_batches,
+                                                      idx, batch_idx, item_idx));
         struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
         struct datablock_kernel_arg *db_results    = datablocks[dbid_ipv4_lookup_results_d];
         uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];

diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu
@@ -1,18 +1,16 @@
-
+#include <cstdint>
+#include <cassert>
 #include <cuda.h>
 #include <nba/engines/cuda/utils.hh>
-#include "IPsecAES_kernel.hh"
-
-#include <stdint.h>
+#include <nba/core/errors.hh>
+#include <nba/core/accumidx.hh>
+#include <nba/framework/datablock_shared.hh>
 
-#include <assert.h>
-#include <stdio.h>
+#include "IPsecAES_kernel.hh"
 
 #include <openssl/aes.h>
 #include <openssl/md5.h>
 
-#include <nba/framework/datablock_shared.hh>
-
 /* The index is given by the order in get_used_datablocks(). */
 #define dbid_enc_payloads_d   (0)
 #define dbid_flow_ids_d       (1)
@@ -680,7 +678,7 @@ __device__ static void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,
 
 __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
         struct datablock_kernel_arg **datablocks,
-        uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
+        uint32_t count, uint32_t *item_counts, uint32_t num_batches,
         uint8_t *checkbits_d,
         struct aes_sa_entry* flows
         )
@@ -691,11 +689,13 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
     __shared__ uint32_t shared_Te3[256];
     __shared__ uint32_t shared_Rcon[10];
 
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count && count != 0) {
 
-    const uint8_t batch_idx = batch_ids[idx];
-    const uint16_t item_idx = item_ids[idx];
+    uint32_t batch_idx, item_idx;
+    nba::error_t err;
+    err = nba::get_accum_idx(item_counts, num_batches, idx, batch_idx, item_idx);
+    assert(err == nba::NBA_SUCCESS);
 
     const struct datablock_kernel_arg *db_enc_payloads    = datablocks[dbid_enc_payloads_d];
     const struct datablock_kernel_arg *const db_flow_ids        = datablocks[dbid_flow_ids_d];

diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1,12 +1,13 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <assert.h>
+#include <cstdint>
+#include <cassert>
+#include <cuda.h>
 #include <nba/engines/cuda/utils.hh>
+#include <nba/core/errors.hh>
+#include <nba/core/accumidx.hh>
+#include <nba/framework/datablock_shared.hh>
 
 #include "IPsecAuthHMACSHA1_kernel.hh"
 
-#include <nba/framework/datablock_shared.hh>
-
 /* The index is given by the order in get_used_datablocks(). */
 #define dbid_enc_payloads_d (0)
 #define dbid_flow_ids_d     (1)
@@ -1238,15 +1239,17 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,
 
 __global__ void computeHMAC_SHA1_3(
         struct datablock_kernel_arg **datablocks,
-        uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
+        uint32_t count, uint32_t *item_counts, uint32_t num_batches,
         uint8_t *checkbits_d,
         struct hmac_sa_entry *hmac_key_array)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < count && count != 0) {
-        const uint8_t batch_idx = batch_ids[idx];
-        const uint16_t item_idx  = item_ids[idx];
-        assert(item_idx < 64);
+        uint32_t batch_idx, item_idx;
+        nba::error_t err;
+        err = nba::get_accum_idx(item_counts, num_batches, idx, batch_idx, item_idx);
+        assert(err == nba::NBA_SUCCESS);
+
         const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
         const struct datablock_kernel_arg *db_flow_ids     = datablocks[dbid_flow_ids_d];
 

diff --git a/elements/ipv6/LookupIP6Route_kernel.cu b/elements/ipv6/LookupIP6Route_kernel.cu
@@ -4,6 +4,8 @@
 
 #include <cuda.h>
 #include <nba/engines/cuda/utils.hh>
+#include <nba/core/errors.hh>
+#include <nba/core/accumidx.hh>
 #include "LookupIP6Route_kernel.hh"
 
 #include "util_jhash.h"
@@ -144,16 +146,17 @@ __device__ static uint64_t ntohll(uint64_t val)
 
 __global__ void ipv6_route_lookup_cuda(
         struct datablock_kernel_arg **datablocks,
-        uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
+        uint32_t count, uint32_t *item_counts, uint32_t num_batches,
         uint8_t *checkbits_d,
         Item** __restrict__ tables_d,
         size_t* __restrict__ table_sizes_d)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < count) {
-        uint8_t batch_idx = batch_ids[idx];
-        uint16_t item_idx  = item_ids[idx];
+        uint32_t batch_idx, item_idx;
+        assert(nba::NBA_SUCCESS == nba::get_accum_idx(item_counts, num_batches,
+                                                      idx, batch_idx, item_idx));
         struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
         struct datablock_kernel_arg *db_results    = datablocks[dbid_ipv6_lookup_results_d];
         struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];

diff --git a/include/nba/core/accumidx.hh b/include/nba/core/accumidx.hh
@@ -4,14 +4,24 @@
 #include <nba/core/errors.hh>
 #include <type_traits>
 
+#ifndef __CUDACC__
+#ifndef __host__
+#define __host__
+#endif
+#ifndef __device__
+#define __device__
+#endif
+#endif
+
 namespace nba {
 
 template<typename T>
-static inline nba::error_t get_accum_idx(const T *group_counts,
-                                         const T num_groups,
-                                         const T global_idx,
-                                         T &group_idx,
-                                         T &item_idx)
+__host__ __device__ static inline nba::error_t get_accum_idx(
+    const T *group_counts,
+    const T num_groups,
+    const T global_idx,
+    T &group_idx,
+    T &item_idx)
 {
     static_assert(std::is_integral<T>::value, "Integer type required.");
     T sum = 0;

diff --git a/include/nba/engines/cuda/computecontext.hh b/include/nba/engines/cuda/computecontext.hh
@@ -27,7 +27,7 @@ public:
     virtual ~CUDAComputeContext();
 
     io_base_t alloc_io_base();
-    int alloc_input_buffer(io_base_t io_base, size_t size, 
+    int alloc_input_buffer(io_base_t io_base, size_t size,
                            host_mem_t &host_ptr, dev_mem_t &dev_ptr);
     int alloc_output_buffer(io_base_t io_base, size_t size,
                             host_mem_t &host_ptr, dev_mem_t &dev_ptr);

diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
@@ -279,10 +279,8 @@ void OffloadTask::execute()
     int dbid = elem->get_offload_item_counter_dbid();
     DataBlock *db = comp_ctx->datablock_registry[dbid];
 
-    host_mem_t batch_ids_h;
-    dev_mem_t batch_ids_d;
-    host_mem_t item_ids_h;
-    dev_mem_t item_ids_d;
+    host_mem_t item_counts_h;
+    dev_mem_t item_counts_d;
 
     for (PacketBatch *batch : batches) {
         struct datablock_tracker *t = &batch->datablock_states[dbid];
@@ -291,28 +289,17 @@ void OffloadTask::execute()
 
     if (all_item_count > 0) {
 
-        cctx->alloc_input_buffer(io_base, sizeof(uint16_t) * all_item_count,
-                                 batch_ids_h, batch_ids_d);
-        _debug_print_inb("execute.batch_ids", nullptr, 0);
-        cctx->alloc_input_buffer(io_base, sizeof(uint16_t) * all_item_count,
-                                 item_ids_h, item_ids_d);
-        _debug_print_inb("execute.item_ids", nullptr, 0);
-        uint8_t *batch_ids = (uint8_t *) cctx->unwrap_host_buffer(batch_ids_h);
-        uint16_t *item_ids  = (uint16_t *) cctx->unwrap_host_buffer(item_ids_h);
+        cctx->alloc_input_buffer(io_base, sizeof(uint32_t) * batches.size(),
+                                 item_counts_h, item_counts_d);
+        _debug_print_inb("execute.item_counts", nullptr, 0);
+        uint32_t *item_counts = (uint32_t *) cctx->unwrap_host_buffer(item_counts_h);
+        uint32_t num_batches = batches.size();
         res.num_workitems = all_item_count;
         res.num_threads_per_workgroup = elem->get_desired_workgroup_size(cctx->type_name.c_str());
         res.num_workgroups = (all_item_count + res.num_threads_per_workgroup - 1)
                              / res.num_threads_per_workgroup;
-        uint8_t batch_id = 0;
-        unsigned global_idx = 0;
-        for (PacketBatch *batch : batches) {
-            struct datablock_tracker *t = &batch->datablock_states[dbid];
-            for (unsigned item_id = 0; item_id < t->in_count; item_id ++) {
-                batch_ids[global_idx] = batch_id;
-                item_ids[global_idx]  = item_id;
-                global_idx ++;
-            }
-            batch_id ++;
+        for (auto&& pair : enumerate(batches)) {
+            item_counts[pair.first] = (pair.second)->datablock_states[dbid].in_count;
         }
 
         size_t total_input_size = cctx->get_input_size(io_base) - input_begin;
@@ -345,12 +332,11 @@ void OffloadTask::execute()
         arg = {(void *) &all_item_count, sizeof(uint32_t), alignof(uint32_t)};
         cctx->push_kernel_arg(arg);
 
-        ptr_args[1] = cctx->unwrap_device_buffer(batch_ids_d);
+        ptr_args[1] = cctx->unwrap_device_buffer(item_counts_d);
         arg = {&ptr_args[1], sizeof(void *), alignof(void *)};
         cctx->push_kernel_arg(arg);
 
-        ptr_args[2] = cctx->unwrap_device_buffer(item_ids_d);
-        arg = {&ptr_args[2], sizeof(void *), alignof(void *)};
+        arg = {(void *) &num_batches, sizeof(uint32_t), alignof(uint32_t)};
         cctx->push_kernel_arg(arg);
 
         arg = {(void *) &checkbits_d, sizeof(void *), alignof(void *)};

diff --git a/tests/test_ipv4route.cc b/tests/test_ipv4route.cc
@@ -182,31 +182,24 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatch) {
                                       db_arg_size, cudaMemcpyHostToDevice));
     void *dbarray_h[2] = { db_ipv4_dest_addrs_d, db_ipv4_lookup_results_d };
     void *dbarray_d = nullptr;
-    uint8_t batch_ids[count] = { 0, 0 };
-    uint16_t item_ids[count] = { 0, 1 };
-    void *batch_ids_d = nullptr;
-    void *item_ids_d  = nullptr;
+    uint32_t item_counts[num_batches] = { num_pkts };
+    void *item_counts_d = nullptr;
     ASSERT_EQ(cudaSuccess, cudaMalloc(&dbarray_d, sizeof(void*) * 2));
-    ASSERT_EQ(cudaSuccess, cudaMalloc(&batch_ids_d, sizeof(uint8_t) * count));
-    ASSERT_EQ(cudaSuccess, cudaMalloc(&item_ids_d, sizeof(uint16_t) * count));
+    ASSERT_EQ(cudaSuccess, cudaMalloc(&item_counts_d, sizeof(uint32_t) * count));
     ASSERT_NE(nullptr, dbarray_d);
-    ASSERT_NE(nullptr, batch_ids_d);
-    ASSERT_NE(nullptr, item_ids_d);
+    ASSERT_NE(nullptr, item_counts_d);
     ASSERT_EQ(cudaSuccess, cudaMemcpy(dbarray_d, dbarray_h,
                                       sizeof(void*) * 2,
                                       cudaMemcpyHostToDevice));
-    ASSERT_EQ(cudaSuccess, cudaMemcpy(batch_ids_d, batch_ids,
-                                      sizeof(uint8_t) * count,
-                                      cudaMemcpyHostToDevice));
-    ASSERT_EQ(cudaSuccess, cudaMemcpy(item_ids_d, item_ids,
-                                      sizeof(uint16_t) * count,
+    ASSERT_EQ(cudaSuccess, cudaMemcpy(item_counts_d, item_counts,
+                                      sizeof(uint32_t) * num_batches,
                                       cudaMemcpyHostToDevice));
     void *checkbits_d = nullptr;
 
     void *raw_args[7] = {
         &dbarray_d,
         (void *) &num_pkts,
-        &batch_ids_d, &item_ids_d,
+        &item_counts_d, (void *) &num_batches,
         &checkbits_d,
         &tbl24_d, &tbllong_d
     };
@@ -228,8 +221,7 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatch) {
     free(output_buffer);
     ASSERT_EQ(cudaSuccess, cudaFree(input_buffer_d));
     ASSERT_EQ(cudaSuccess, cudaFree(output_buffer_d));
-    ASSERT_EQ(cudaSuccess, cudaFree(batch_ids_d));
-    ASSERT_EQ(cudaSuccess, cudaFree(item_ids_d));
+    ASSERT_EQ(cudaSuccess, cudaFree(item_counts_d));
     ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_dest_addrs_d));
     ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_lookup_results_d));
     ASSERT_EQ(cudaSuccess, cudaFree(dbarray_d));
@@ -356,31 +348,24 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatchWithDatablock) {
                                       db_arg_size, cudaMemcpyHostToDevice));
     void *dbarray_h[2] = { db_ipv4_dest_addrs_d, db_ipv4_lookup_results_d };
     void *dbarray_d = nullptr;
-    uint8_t batch_ids[num_pkts] = { 0, 0 };
-    uint16_t item_ids[num_pkts] = { 0, 1 };
-    void *batch_ids_d = nullptr;
-    void *item_ids_d  = nullptr;
+    uint32_t item_counts[num_batches] = { num_pkts };
+    void *item_counts_d = nullptr;
     ASSERT_EQ(cudaSuccess, cudaMalloc(&dbarray_d, sizeof(void*) * 2));
-    ASSERT_EQ(cudaSuccess, cudaMalloc(&batch_ids_d, sizeof(uint8_t) * num_pkts));
-    ASSERT_EQ(cudaSuccess, cudaMalloc(&item_ids_d, sizeof(uint16_t) * num_pkts));
+    ASSERT_EQ(cudaSuccess, cudaMalloc(&item_counts_d, sizeof(uint32_t) * num_batches));
     ASSERT_NE(nullptr, dbarray_d);
-    ASSERT_NE(nullptr, batch_ids_d);
-    ASSERT_NE(nullptr, item_ids_d);
+    ASSERT_NE(nullptr, item_counts_d);
     ASSERT_EQ(cudaSuccess, cudaMemcpy(dbarray_d, dbarray_h,
                                       sizeof(void*) * 2,
                                       cudaMemcpyHostToDevice));
-    ASSERT_EQ(cudaSuccess, cudaMemcpy(batch_ids_d, batch_ids,
-                                      sizeof(uint8_t) * in_count,
-                                      cudaMemcpyHostToDevice));
-    ASSERT_EQ(cudaSuccess, cudaMemcpy(item_ids_d, item_ids,
-                                      sizeof(uint16_t) * in_count,
+    ASSERT_EQ(cudaSuccess, cudaMemcpy(item_counts_d, item_counts,
+                                      sizeof(uint32_t) * num_batches,
                                       cudaMemcpyHostToDevice));
     void *checkbits_d = nullptr;
 
     void *raw_args[7] = {
         &dbarray_d,
         (void *) &num_pkts,
-        &batch_ids_d, &item_ids_d,
+        &item_counts_d, (void *) &num_batches,
         &checkbits_d,
         &tbl24_d, &tbllong_d
     };
@@ -404,8 +389,7 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatchWithDatablock) {
     free(output_buffer);
     ASSERT_EQ(cudaSuccess, cudaFree(input_buffer_d));
     ASSERT_EQ(cudaSuccess, cudaFree(output_buffer_d));
-    ASSERT_EQ(cudaSuccess, cudaFree(batch_ids_d));
-    ASSERT_EQ(cudaSuccess, cudaFree(item_ids_d));
+    ASSERT_EQ(cudaSuccess, cudaFree(item_counts_d));
     ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_dest_addrs_d));
     ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_lookup_results_d));
     ASSERT_EQ(cudaSuccess, cudaFree(dbarray_d));