Skip to content

Commit

Permalink
refs #6: Unify in/out buffer pointers in datablocks.
Browse files Browse the repository at this point in the history
  • Loading branch information
achimnol committed Mar 5, 2016
1 parent 1d78da1 commit b033410
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 47 deletions.
4 changes: 2 additions & 2 deletions elements/ip/IPlookup_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ __global__ void ipv4_route_lookup_cuda(
uint16_t item_idx = item_ids[idx];
struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d];
uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases_out)[item_idx];
uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases)[item_idx];
uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases)[item_idx];

if (daddr == IGNORED_IP) {
*lookup_result = 0;
Expand Down
14 changes: 7 additions & 7 deletions elements/ipsec/IPsecAES_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -702,19 +702,19 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
const struct datablock_kernel_arg *const db_iv = datablocks[dbid_iv_d];
const struct datablock_kernel_arg *const db_aes_block_info = datablocks[dbid_aes_block_info_d];

assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);
assert(item_idx < db_aes_block_info->batches[batch_idx].item_count);

uint64_t flow_id = 65536;
const struct aes_block_info &cur_block_info = ((struct aes_block_info *)
db_aes_block_info->batches[batch_idx].buffer_bases_in)
db_aes_block_info->batches[batch_idx].buffer_bases)
[item_idx];
const int pkt_idx = cur_block_info.pkt_idx;
const int block_idx_local = cur_block_info.block_idx;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets[pkt_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes[pkt_idx];

if (cur_block_info.magic == 85739 && pkt_idx < 64 && length != 0) {
flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases)[pkt_idx];
if (flow_id != 65536)
assert(flow_id < 1024);
}
Expand Down Expand Up @@ -744,9 +744,9 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
assert(pkt_idx < 64);

const uint8_t *const aes_key = flows[flow_id].aes_key;
uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases_in
uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases
+ (uintptr_t) (16 * pkt_idx));
const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;
const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases) + offset;
uint4 ecounter = {0,0,0,0};

assert(enc_payload != NULL);
Expand Down
8 changes: 4 additions & 4 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1250,11 +1250,11 @@ __global__ void computeHMAC_SHA1_3(
const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];

const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets[item_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes[item_idx];
if (enc_payload_base != NULL && length != 0) {
const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases)[item_idx];
if (flow_id != 65536) {
assert(flow_id < 1024);
const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;
Expand Down
4 changes: 2 additions & 2 deletions elements/ipv6/LookupIP6Route_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ __global__ void ipv6_route_lookup_cuda(
uint16_t item_idx = item_ids[idx];
struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];
struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases)[item_idx];
uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases)[item_idx];

// NOTE: On FERMI devices, using shared memory to store just 128
// pointers is not necessary since they have on-chip L1
Expand Down
18 changes: 6 additions & 12 deletions include/nba/framework/datablock_shared.hh
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,15 @@
#include <nba/core/shiftedint.hh>

struct alignas(8) datablock_batch_info {
void *buffer_bases_in;
void *buffer_bases_out;
uint32_t item_count_in;
uint32_t item_count_out;
uint16_t *item_sizes_in;
uint16_t *item_sizes_out;
nba::dev_offset_t *item_offsets_in;
nba::dev_offset_t *item_offsets_out;
void *buffer_bases;
uint32_t item_count;
uint16_t *item_sizes;
nba::dev_offset_t *item_offsets;
};

struct alignas(8) datablock_kernel_arg {
uint32_t total_item_count_in;
uint32_t total_item_count_out;
uint16_t item_size_in; // for fixed-size cases
uint16_t item_size_out; // for fixed-size cases
uint32_t total_item_count;
uint16_t item_size; // for fixed-size cases
struct datablock_batch_info batches[0];
};

Expand Down
2 changes: 1 addition & 1 deletion src/engines/cuda/test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ __global__ void dbarg_size_check(size_t *sizes, size_t *offsets)
sizes[0] = sizeof(struct datablock_kernel_arg);
offsets[0] = offsetof(struct datablock_kernel_arg, batches);
sizes[1] = sizeof(struct datablock_batch_info);
offsets[1] = offsetof(struct datablock_batch_info, item_offsets_in);
offsets[1] = offsetof(struct datablock_batch_info, item_offsets);
}

void *nba::get_test_kernel_shiftedint_size_check()
Expand Down
36 changes: 18 additions & 18 deletions src/lib/offloadtask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,7 @@ bool OffloadTask::copy_h2d()
cctx->alloc_input_buffer(io_base, dbarg_size, dbarg_h, dbarg_d);
dbarg = (struct datablock_kernel_arg *) cctx->unwrap_host_buffer(dbarg_h);
dbarray[dbid_d] = (struct datablock_kernel_arg *) cctx->unwrap_device_buffer(dbarg_d);
dbarg->total_item_count_in = 0;
dbarg->total_item_count_out = 0;
dbarg->total_item_count = 0;

// NOTE: To use our "datablock kernel arg" data structures,
// the underlying kernel language must support generic
Expand All @@ -234,29 +233,30 @@ bool OffloadTask::copy_h2d()
* have different lengths. */
//assert(t->aligned_item_sizes_h != nullptr);
uintptr_t base_ptr = (uintptr_t) cctx->unwrap_device_buffer(t->aligned_item_sizes_d);
dbarg->batches[b].item_sizes_in = (uint16_t *)
dbarg->batches[b].item_sizes = (uint16_t *)
(base_ptr + offsetof(struct item_size_info, sizes));
dbarg->batches[b].item_sizes_out = (uint16_t *)
(base_ptr + offsetof(struct item_size_info, sizes));
dbarg->batches[b].item_offsets_in = (dev_offset_t *)
(base_ptr + offsetof(struct item_size_info, offsets));
dbarg->batches[b].item_offsets_out = (dev_offset_t *)
dbarg->batches[b].item_offsets = (dev_offset_t *)
(base_ptr + offsetof(struct item_size_info, offsets));
} else {
/* Same for all batches.
* We assume the module developer knows the fixed length
* when writing device kernel codes. */
dbarg->item_size_in = rri.length;
dbarg->item_size_out = wri.length;
dbarg->batches[b].item_offsets_in = nullptr;
dbarg->batches[b].item_offsets_out = nullptr;
if (rri.type != READ_NONE)
dbarg->item_size = rri.length;
if (wri.type != WRITE_NONE)
dbarg->item_size = wri.length;
dbarg->batches[b].item_offsets = nullptr;
}
if (rri.type != READ_NONE) {
dbarg->batches[b].buffer_bases = cctx->unwrap_device_buffer(t->dev_in_ptr);
dbarg->batches[b].item_count = t->in_count;
dbarg->total_item_count += t->in_count;
}
if (wri.type != WRITE_NONE) {
dbarg->batches[b].buffer_bases = cctx->unwrap_device_buffer(t->dev_out_ptr);
dbarg->batches[b].item_count = t->out_count;
dbarg->total_item_count += t->out_count;
}
dbarg->batches[b].buffer_bases_in = cctx->unwrap_device_buffer(t->dev_in_ptr);
dbarg->batches[b].item_count_in = t->in_count;
dbarg->total_item_count_in += t->in_count;
dbarg->batches[b].buffer_bases_out = cctx->unwrap_device_buffer(t->dev_out_ptr);
dbarg->batches[b].item_count_out = t->out_count;
dbarg->total_item_count_out += t->out_count;
} /* endfor(batches) */
} /* endfor(dbid) */
return true;
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ TEST_P(CUDAStructTest, DatablockArgSizeAlignCheck) {
EXPECT_EQ(sizeof(struct datablock_kernel_arg), output_sizes_h[0]);
EXPECT_EQ(offsetof(struct datablock_kernel_arg, batches), output_offsets_h[0]);
EXPECT_EQ(sizeof(struct datablock_batch_info), output_sizes_h[1]);
EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets_in), output_offsets_h[1]);
EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets), output_offsets_h[1]);
ASSERT_EQ(cudaSuccess, cudaFree(output_sizes_d));
ASSERT_EQ(cudaSuccess, cudaFree(output_offsets_d));
ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
Expand Down

0 comments on commit b033410

Please sign in to comment.