Skip to content

Commit

Permalink
refs #6: Reduce memory footprints for batch/item ID arrays
Browse files Browse the repository at this point in the history
 * Replaces them with the new accumulated index calculator.
  • Loading branch information
achimnol committed Mar 5, 2016
1 parent 1d78da1 commit 82d0bac
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 94 deletions.
13 changes: 8 additions & 5 deletions elements/ip/IPlookup_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

// includes, project
#include <cuda.h>
#include <nba/core/errors.hh>
#include <nba/core/accumidx.hh>
#include <nba/engines/cuda/utils.hh>
#include "IPlookup_kernel.hh"

Expand All @@ -23,7 +25,7 @@ extern "C" {
#define dbid_ipv4_dest_addrs_d (0)
#define dbid_ipv4_lookup_results_d (1)

__device__ static uint32_t ntohl(uint32_t n)
__device__ static inline uint32_t ntohl(uint32_t n)
{
return ((n & 0xff000000) >> 24) | ((n & 0x00ff0000) >> 8) | \
((n & 0x0000ff00) << 8) | ((n & 0x000000ff) << 24);
Expand All @@ -32,16 +34,17 @@ __device__ static uint32_t ntohl(uint32_t n)
/* The GPU kernel. */
__global__ void ipv4_route_lookup_cuda(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint32_t *item_counts, uint32_t num_batches,
uint8_t *checkbits_d,
uint16_t* __restrict__ TBL24_d,
uint16_t* __restrict__ TBLlong_d)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < count) {
uint8_t batch_idx = batch_ids[idx];
uint16_t item_idx = item_ids[idx];
uint32_t batch_idx, item_idx;
assert(nba::NBA_SUCCESS == nba::get_accum_idx(item_counts, num_batches,
idx, batch_idx, item_idx));
struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d];
uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
Expand Down
24 changes: 12 additions & 12 deletions elements/ipsec/IPsecAES_kernel.cu
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@

#include <cstdint>
#include <cassert>
#include <cuda.h>
#include <nba/engines/cuda/utils.hh>
#include "IPsecAES_kernel.hh"

#include <stdint.h>
#include <nba/core/errors.hh>
#include <nba/core/accumidx.hh>
#include <nba/framework/datablock_shared.hh>

#include <assert.h>
#include <stdio.h>
#include "IPsecAES_kernel.hh"

#include <openssl/aes.h>
#include <openssl/md5.h>

#include <nba/framework/datablock_shared.hh>

/* The index is given by the order in get_used_datablocks(). */
#define dbid_enc_payloads_d (0)
#define dbid_flow_ids_d (1)
Expand Down Expand Up @@ -680,7 +678,7 @@ __device__ static void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,

__global__ void AES_ctr_encrypt_chunk_SharedMem_5(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint32_t *item_counts, uint32_t num_batches,
uint8_t *checkbits_d,
struct aes_sa_entry* flows
)
Expand All @@ -691,11 +689,13 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
__shared__ uint32_t shared_Te3[256];
__shared__ uint32_t shared_Rcon[10];

int idx = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < count && count != 0) {

const uint8_t batch_idx = batch_ids[idx];
const uint16_t item_idx = item_ids[idx];
uint32_t batch_idx, item_idx;
nba::error_t err;
err = nba::get_accum_idx(item_counts, num_batches, idx, batch_idx, item_idx);
assert(err == nba::NBA_SUCCESS);

const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
const struct datablock_kernel_arg *const db_flow_ids = datablocks[dbid_flow_ids_d];
Expand Down
23 changes: 13 additions & 10 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <cstdint>
#include <cassert>
#include <cuda.h>
#include <nba/engines/cuda/utils.hh>
#include <nba/core/errors.hh>
#include <nba/core/accumidx.hh>
#include <nba/framework/datablock_shared.hh>

#include "IPsecAuthHMACSHA1_kernel.hh"

#include <nba/framework/datablock_shared.hh>

/* The index is given by the order in get_used_datablocks(). */
#define dbid_enc_payloads_d (0)
#define dbid_flow_ids_d (1)
Expand Down Expand Up @@ -1238,15 +1239,17 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,

__global__ void computeHMAC_SHA1_3(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint32_t *item_counts, uint32_t num_batches,
uint8_t *checkbits_d,
struct hmac_sa_entry *hmac_key_array)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < count && count != 0) {
const uint8_t batch_idx = batch_ids[idx];
const uint16_t item_idx = item_ids[idx];
assert(item_idx < 64);
uint32_t batch_idx, item_idx;
nba::error_t err;
err = nba::get_accum_idx(item_counts, num_batches, idx, batch_idx, item_idx);
assert(err == nba::NBA_SUCCESS);

const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];

Expand Down
11 changes: 7 additions & 4 deletions elements/ipv6/LookupIP6Route_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <cuda.h>
#include <nba/engines/cuda/utils.hh>
#include <nba/core/errors.hh>
#include <nba/core/accumidx.hh>
#include "LookupIP6Route_kernel.hh"

#include "util_jhash.h"
Expand Down Expand Up @@ -144,16 +146,17 @@ __device__ static uint64_t ntohll(uint64_t val)

__global__ void ipv6_route_lookup_cuda(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint32_t *item_counts, uint32_t num_batches,
uint8_t *checkbits_d,
Item** __restrict__ tables_d,
size_t* __restrict__ table_sizes_d)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < count) {
uint8_t batch_idx = batch_ids[idx];
uint16_t item_idx = item_ids[idx];
uint32_t batch_idx, item_idx;
assert(nba::NBA_SUCCESS == nba::get_accum_idx(item_counts, num_batches,
idx, batch_idx, item_idx));
struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
Expand Down
20 changes: 15 additions & 5 deletions include/nba/core/accumidx.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,24 @@
#include <nba/core/errors.hh>
#include <type_traits>

#ifndef __CUDACC__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif

namespace nba {

template<typename T>
static inline nba::error_t get_accum_idx(const T *group_counts,
const T num_groups,
const T global_idx,
T &group_idx,
T &item_idx)
__host__ __device__ static inline nba::error_t get_accum_idx(
const T *group_counts,
const T num_groups,
const T global_idx,
T &group_idx,
T &item_idx)
{
static_assert(std::is_integral<T>::value, "Integer type required.");
T sum = 0;
Expand Down
2 changes: 1 addition & 1 deletion include/nba/engines/cuda/computecontext.hh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public:
virtual ~CUDAComputeContext();

io_base_t alloc_io_base();
int alloc_input_buffer(io_base_t io_base, size_t size,
int alloc_input_buffer(io_base_t io_base, size_t size,
host_mem_t &host_ptr, dev_mem_t &dev_ptr);
int alloc_output_buffer(io_base_t io_base, size_t size,
host_mem_t &host_ptr, dev_mem_t &dev_ptr);
Expand Down
36 changes: 11 additions & 25 deletions src/lib/offloadtask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,10 +279,8 @@ void OffloadTask::execute()
int dbid = elem->get_offload_item_counter_dbid();
DataBlock *db = comp_ctx->datablock_registry[dbid];

host_mem_t batch_ids_h;
dev_mem_t batch_ids_d;
host_mem_t item_ids_h;
dev_mem_t item_ids_d;
host_mem_t item_counts_h;
dev_mem_t item_counts_d;

for (PacketBatch *batch : batches) {
struct datablock_tracker *t = &batch->datablock_states[dbid];
Expand All @@ -291,28 +289,17 @@ void OffloadTask::execute()

if (all_item_count > 0) {

cctx->alloc_input_buffer(io_base, sizeof(uint16_t) * all_item_count,
batch_ids_h, batch_ids_d);
_debug_print_inb("execute.batch_ids", nullptr, 0);
cctx->alloc_input_buffer(io_base, sizeof(uint16_t) * all_item_count,
item_ids_h, item_ids_d);
_debug_print_inb("execute.item_ids", nullptr, 0);
uint8_t *batch_ids = (uint8_t *) cctx->unwrap_host_buffer(batch_ids_h);
uint16_t *item_ids = (uint16_t *) cctx->unwrap_host_buffer(item_ids_h);
cctx->alloc_input_buffer(io_base, sizeof(uint32_t) * batches.size(),
item_counts_h, item_counts_d);
_debug_print_inb("execute.item_counts", nullptr, 0);
uint32_t *item_counts = (uint32_t *) cctx->unwrap_host_buffer(item_counts_h);
uint32_t num_batches = batches.size();
res.num_workitems = all_item_count;
res.num_threads_per_workgroup = elem->get_desired_workgroup_size(cctx->type_name.c_str());
res.num_workgroups = (all_item_count + res.num_threads_per_workgroup - 1)
/ res.num_threads_per_workgroup;
uint8_t batch_id = 0;
unsigned global_idx = 0;
for (PacketBatch *batch : batches) {
struct datablock_tracker *t = &batch->datablock_states[dbid];
for (unsigned item_id = 0; item_id < t->in_count; item_id ++) {
batch_ids[global_idx] = batch_id;
item_ids[global_idx] = item_id;
global_idx ++;
}
batch_id ++;
for (auto&& pair : enumerate(batches)) {
item_counts[pair.first] = (pair.second)->datablock_states[dbid].in_count;
}

size_t total_input_size = cctx->get_input_size(io_base) - input_begin;
Expand Down Expand Up @@ -345,12 +332,11 @@ void OffloadTask::execute()
arg = {(void *) &all_item_count, sizeof(uint32_t), alignof(uint32_t)};
cctx->push_kernel_arg(arg);

ptr_args[1] = cctx->unwrap_device_buffer(batch_ids_d);
ptr_args[1] = cctx->unwrap_device_buffer(item_counts_d);
arg = {&ptr_args[1], sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);

ptr_args[2] = cctx->unwrap_device_buffer(item_ids_d);
arg = {&ptr_args[2], sizeof(void *), alignof(void *)};
arg = {(void *) &num_batches, sizeof(uint32_t), alignof(uint32_t)};
cctx->push_kernel_arg(arg);

arg = {(void *) &checkbits_d, sizeof(void *), alignof(void *)};
Expand Down
48 changes: 16 additions & 32 deletions tests/test_ipv4route.cc
Original file line number Diff line number Diff line change
Expand Up @@ -182,31 +182,24 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatch) {
db_arg_size, cudaMemcpyHostToDevice));
void *dbarray_h[2] = { db_ipv4_dest_addrs_d, db_ipv4_lookup_results_d };
void *dbarray_d = nullptr;
uint8_t batch_ids[count] = { 0, 0 };
uint16_t item_ids[count] = { 0, 1 };
void *batch_ids_d = nullptr;
void *item_ids_d = nullptr;
uint32_t item_counts[num_batches] = { num_pkts };
void *item_counts_d = nullptr;
ASSERT_EQ(cudaSuccess, cudaMalloc(&dbarray_d, sizeof(void*) * 2));
ASSERT_EQ(cudaSuccess, cudaMalloc(&batch_ids_d, sizeof(uint8_t) * count));
ASSERT_EQ(cudaSuccess, cudaMalloc(&item_ids_d, sizeof(uint16_t) * count));
ASSERT_EQ(cudaSuccess, cudaMalloc(&item_counts_d, sizeof(uint32_t) * count));
ASSERT_NE(nullptr, dbarray_d);
ASSERT_NE(nullptr, batch_ids_d);
ASSERT_NE(nullptr, item_ids_d);
ASSERT_NE(nullptr, item_counts_d);
ASSERT_EQ(cudaSuccess, cudaMemcpy(dbarray_d, dbarray_h,
sizeof(void*) * 2,
cudaMemcpyHostToDevice));
ASSERT_EQ(cudaSuccess, cudaMemcpy(batch_ids_d, batch_ids,
sizeof(uint8_t) * count,
cudaMemcpyHostToDevice));
ASSERT_EQ(cudaSuccess, cudaMemcpy(item_ids_d, item_ids,
sizeof(uint16_t) * count,
ASSERT_EQ(cudaSuccess, cudaMemcpy(item_counts_d, item_counts,
sizeof(uint32_t) * num_batches,
cudaMemcpyHostToDevice));
void *checkbits_d = nullptr;

void *raw_args[7] = {
&dbarray_d,
(void *) &num_pkts,
&batch_ids_d, &item_ids_d,
&item_counts_d, (void *) &num_batches,
&checkbits_d,
&tbl24_d, &tbllong_d
};
Expand All @@ -228,8 +221,7 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatch) {
free(output_buffer);
ASSERT_EQ(cudaSuccess, cudaFree(input_buffer_d));
ASSERT_EQ(cudaSuccess, cudaFree(output_buffer_d));
ASSERT_EQ(cudaSuccess, cudaFree(batch_ids_d));
ASSERT_EQ(cudaSuccess, cudaFree(item_ids_d));
ASSERT_EQ(cudaSuccess, cudaFree(item_counts_d));
ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_dest_addrs_d));
ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_lookup_results_d));
ASSERT_EQ(cudaSuccess, cudaFree(dbarray_d));
Expand Down Expand Up @@ -356,31 +348,24 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatchWithDatablock) {
db_arg_size, cudaMemcpyHostToDevice));
void *dbarray_h[2] = { db_ipv4_dest_addrs_d, db_ipv4_lookup_results_d };
void *dbarray_d = nullptr;
uint8_t batch_ids[num_pkts] = { 0, 0 };
uint16_t item_ids[num_pkts] = { 0, 1 };
void *batch_ids_d = nullptr;
void *item_ids_d = nullptr;
uint32_t item_counts[num_batches] = { num_pkts };
void *item_counts_d = nullptr;
ASSERT_EQ(cudaSuccess, cudaMalloc(&dbarray_d, sizeof(void*) * 2));
ASSERT_EQ(cudaSuccess, cudaMalloc(&batch_ids_d, sizeof(uint8_t) * num_pkts));
ASSERT_EQ(cudaSuccess, cudaMalloc(&item_ids_d, sizeof(uint16_t) * num_pkts));
ASSERT_EQ(cudaSuccess, cudaMalloc(&item_counts_d, sizeof(uint32_t) * num_batches));
ASSERT_NE(nullptr, dbarray_d);
ASSERT_NE(nullptr, batch_ids_d);
ASSERT_NE(nullptr, item_ids_d);
ASSERT_NE(nullptr, item_counts_d);
ASSERT_EQ(cudaSuccess, cudaMemcpy(dbarray_d, dbarray_h,
sizeof(void*) * 2,
cudaMemcpyHostToDevice));
ASSERT_EQ(cudaSuccess, cudaMemcpy(batch_ids_d, batch_ids,
sizeof(uint8_t) * in_count,
cudaMemcpyHostToDevice));
ASSERT_EQ(cudaSuccess, cudaMemcpy(item_ids_d, item_ids,
sizeof(uint16_t) * in_count,
ASSERT_EQ(cudaSuccess, cudaMemcpy(item_counts_d, item_counts,
sizeof(uint32_t) * num_batches,
cudaMemcpyHostToDevice));
void *checkbits_d = nullptr;

void *raw_args[7] = {
&dbarray_d,
(void *) &num_pkts,
&batch_ids_d, &item_ids_d,
&item_counts_d, (void *) &num_batches,
&checkbits_d,
&tbl24_d, &tbllong_d
};
Expand All @@ -404,8 +389,7 @@ TEST_P(IPLookupCUDAMatchTest, SingleBatchWithDatablock) {
free(output_buffer);
ASSERT_EQ(cudaSuccess, cudaFree(input_buffer_d));
ASSERT_EQ(cudaSuccess, cudaFree(output_buffer_d));
ASSERT_EQ(cudaSuccess, cudaFree(batch_ids_d));
ASSERT_EQ(cudaSuccess, cudaFree(item_ids_d));
ASSERT_EQ(cudaSuccess, cudaFree(item_counts_d));
ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_dest_addrs_d));
ASSERT_EQ(cudaSuccess, cudaFree(db_ipv4_lookup_results_d));
ASSERT_EQ(cudaSuccess, cudaFree(dbarray_d));
Expand Down

0 comments on commit 82d0bac

Please sign in to comment.