From b0334109260039b319044dc20fc5b736bf8742f5 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Sat, 5 Mar 2016 21:49:38 +0900 Subject: [PATCH] refs #6: Unify in/out buffer pointers in datablocks. --- elements/ip/IPlookup_kernel.cu | 4 +-- elements/ipsec/IPsecAES_kernel.cu | 14 ++++----- elements/ipsec/IPsecAuthHMACSHA1_kernel.cu | 8 ++--- elements/ipv6/LookupIP6Route_kernel.cu | 4 +-- include/nba/framework/datablock_shared.hh | 18 ++++------- src/engines/cuda/test.cu | 2 +- src/lib/offloadtask.cc | 36 +++++++++++----------- tests/test_cuda.cc | 2 +- 8 files changed, 41 insertions(+), 47 deletions(-) diff --git a/elements/ip/IPlookup_kernel.cu b/elements/ip/IPlookup_kernel.cu index 507a3dd..c2af47e 100644 --- a/elements/ip/IPlookup_kernel.cu +++ b/elements/ip/IPlookup_kernel.cu @@ -44,8 +44,8 @@ __global__ void ipv4_route_lookup_cuda( uint16_t item_idx = item_ids[idx]; struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d]; struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d]; - uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx]; - uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases_out)[item_idx]; + uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases)[item_idx]; + uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases)[item_idx]; if (daddr == IGNORED_IP) { *lookup_result = 0; diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu index 712e80b..687bfcc 100644 --- a/elements/ipsec/IPsecAES_kernel.cu +++ b/elements/ipsec/IPsecAES_kernel.cu @@ -702,19 +702,19 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5( const struct datablock_kernel_arg *const db_iv = datablocks[dbid_iv_d]; const struct datablock_kernel_arg *const db_aes_block_info = datablocks[dbid_aes_block_info_d]; - assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in); + assert(item_idx < db_aes_block_info->batches[batch_idx].item_count); uint64_t flow_id = 65536; const struct aes_block_info &cur_block_info = ((struct aes_block_info *) - db_aes_block_info->batches[batch_idx].buffer_bases_in) + db_aes_block_info->batches[batch_idx].buffer_bases) [item_idx]; const int pkt_idx = cur_block_info.pkt_idx; const int block_idx_local = cur_block_info.block_idx; - const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value(); - const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]; + const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets[pkt_idx].as_value(); + const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes[pkt_idx]; if (cur_block_info.magic == 85739 && pkt_idx < 64 && length != 0) { - flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx]; + flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases)[pkt_idx]; if (flow_id != 65536) assert(flow_id < 1024); } @@ -744,9 +744,9 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5( assert(pkt_idx < 64); const uint8_t *const aes_key = flows[flow_id].aes_key; - uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases_in + uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases + (uintptr_t) (16 * pkt_idx)); - const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset; + const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases) + offset; uint4 ecounter = {0,0,0,0}; assert(enc_payload != NULL); diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu index 5c7311c..1cc36f0 100644 --- a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu +++ b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu @@ -1250,11 +1250,11 @@ __global__ void computeHMAC_SHA1_3( const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d]; const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d]; - const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in; - const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value(); - const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx]; + const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases; + const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets[item_idx].as_value(); + const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes[item_idx]; if (enc_payload_base != NULL && length != 0) { - const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx]; + const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases)[item_idx]; if (flow_id != 65536) { assert(flow_id < 1024); const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key; diff --git a/elements/ipv6/LookupIP6Route_kernel.cu b/elements/ipv6/LookupIP6Route_kernel.cu index bcbe31a..ec56829 100644 --- a/elements/ipv6/LookupIP6Route_kernel.cu +++ b/elements/ipv6/LookupIP6Route_kernel.cu @@ -156,8 +156,8 @@ __global__ void ipv6_route_lookup_cuda( uint16_t item_idx = item_ids[idx]; struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d]; struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d]; - struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx]; - uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx]; + struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases)[item_idx]; + uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases)[item_idx]; // NOTE: On FERMI devices, using shared memory to store just 128 // pointers is not necessary since they have on-chip L1 diff --git a/include/nba/framework/datablock_shared.hh b/include/nba/framework/datablock_shared.hh index 0e97fb0..032ee11 100644 --- a/include/nba/framework/datablock_shared.hh +++ b/include/nba/framework/datablock_shared.hh @@ -10,21 +10,15 @@ #include struct alignas(8) datablock_batch_info { - void *buffer_bases_in; - void *buffer_bases_out; - uint32_t item_count_in; - uint32_t item_count_out; - uint16_t *item_sizes_in; - uint16_t *item_sizes_out; - nba::dev_offset_t *item_offsets_in; - nba::dev_offset_t *item_offsets_out; + void *buffer_bases; + uint32_t item_count; + uint16_t *item_sizes; + nba::dev_offset_t *item_offsets; }; struct alignas(8) datablock_kernel_arg { - uint32_t total_item_count_in; - uint32_t total_item_count_out; - uint16_t item_size_in; // for fixed-size cases - uint16_t item_size_out; // for fixed-size cases + uint32_t total_item_count; + uint16_t item_size; // for fixed-size cases struct datablock_batch_info batches[0]; }; diff --git a/src/engines/cuda/test.cu b/src/engines/cuda/test.cu index 33b437c..b89db61 100644 --- a/src/engines/cuda/test.cu +++ b/src/engines/cuda/test.cu @@ -29,7 +29,7 @@ __global__ void dbarg_size_check(size_t *sizes, size_t *offsets) sizes[0] = sizeof(struct datablock_kernel_arg); offsets[0] = offsetof(struct datablock_kernel_arg, batches); sizes[1] = sizeof(struct datablock_batch_info); - offsets[1] = offsetof(struct datablock_batch_info, item_offsets_in); + offsets[1] = offsetof(struct datablock_batch_info, item_offsets); } void *nba::get_test_kernel_shiftedint_size_check() diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc index b71fb1e..beed60c 100644 --- a/src/lib/offloadtask.cc +++ b/src/lib/offloadtask.cc @@ -215,8 +215,7 @@ bool OffloadTask::copy_h2d() cctx->alloc_input_buffer(io_base, dbarg_size, dbarg_h, dbarg_d); dbarg = (struct datablock_kernel_arg *) cctx->unwrap_host_buffer(dbarg_h); dbarray[dbid_d] = (struct datablock_kernel_arg *) cctx->unwrap_device_buffer(dbarg_d); - dbarg->total_item_count_in = 0; - dbarg->total_item_count_out = 0; + dbarg->total_item_count = 0; // NOTE: To use our "datablock kernel arg" data structures, // the underlying kernel language must support generic @@ -234,29 +233,30 @@ bool OffloadTask::copy_h2d() * have different lengths. */ //assert(t->aligned_item_sizes_h != nullptr); uintptr_t base_ptr = (uintptr_t) cctx->unwrap_device_buffer(t->aligned_item_sizes_d); - dbarg->batches[b].item_sizes_in = (uint16_t *) + dbarg->batches[b].item_sizes = (uint16_t *) (base_ptr + offsetof(struct item_size_info, sizes)); - dbarg->batches[b].item_sizes_out = (uint16_t *) - (base_ptr + offsetof(struct item_size_info, sizes)); - dbarg->batches[b].item_offsets_in = (dev_offset_t *) - (base_ptr + offsetof(struct item_size_info, offsets)); - dbarg->batches[b].item_offsets_out = (dev_offset_t *) + dbarg->batches[b].item_offsets = (dev_offset_t *) (base_ptr + offsetof(struct item_size_info, offsets)); } else { /* Same for all batches. * We assume the module developer knows the fixed length * when writing device kernel codes. */ - dbarg->item_size_in = rri.length; - dbarg->item_size_out = wri.length; - dbarg->batches[b].item_offsets_in = nullptr; - dbarg->batches[b].item_offsets_out = nullptr; + if (rri.type != READ_NONE) + dbarg->item_size = rri.length; + if (wri.type != WRITE_NONE) + dbarg->item_size = wri.length; + dbarg->batches[b].item_offsets = nullptr; + } + if (rri.type != READ_NONE) { + dbarg->batches[b].buffer_bases = cctx->unwrap_device_buffer(t->dev_in_ptr); + dbarg->batches[b].item_count = t->in_count; + dbarg->total_item_count += t->in_count; + } + if (wri.type != WRITE_NONE) { + dbarg->batches[b].buffer_bases = cctx->unwrap_device_buffer(t->dev_out_ptr); + dbarg->batches[b].item_count = t->out_count; + dbarg->total_item_count += t->out_count; } - dbarg->batches[b].buffer_bases_in = cctx->unwrap_device_buffer(t->dev_in_ptr); - dbarg->batches[b].item_count_in = t->in_count; - dbarg->total_item_count_in += t->in_count; - dbarg->batches[b].buffer_bases_out = cctx->unwrap_device_buffer(t->dev_out_ptr); - dbarg->batches[b].item_count_out = t->out_count; - dbarg->total_item_count_out += t->out_count; } /* endfor(batches) */ } /* endfor(dbid) */ return true; diff --git a/tests/test_cuda.cc b/tests/test_cuda.cc index b8e7f66..35396a6 100644 --- a/tests/test_cuda.cc +++ b/tests/test_cuda.cc @@ -103,7 +103,7 @@ TEST_P(CUDAStructTest, DatablockArgSizeAlignCheck) { EXPECT_EQ(sizeof(struct datablock_kernel_arg), output_sizes_h[0]); EXPECT_EQ(offsetof(struct datablock_kernel_arg, batches), output_offsets_h[0]); EXPECT_EQ(sizeof(struct datablock_batch_info), output_sizes_h[1]); - EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets_in), output_offsets_h[1]); + EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets), output_offsets_h[1]); ASSERT_EQ(cudaSuccess, cudaFree(output_sizes_d)); ASSERT_EQ(cudaSuccess, cudaFree(output_offsets_d)); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());