From b0334109260039b319044dc20fc5b736bf8742f5 Mon Sep 17 00:00:00 2001
From: Joongi Kim <joongi@an.kaist.ac.kr>
Date: Sat, 5 Mar 2016 21:49:38 +0900
Subject: [PATCH] refs #6: Unify in/out buffer pointers in datablocks.

---
 elements/ip/IPlookup_kernel.cu             |  4 +--
 elements/ipsec/IPsecAES_kernel.cu          | 14 ++++-----
 elements/ipsec/IPsecAuthHMACSHA1_kernel.cu |  8 ++---
 elements/ipv6/LookupIP6Route_kernel.cu     |  4 +--
 include/nba/framework/datablock_shared.hh  | 18 ++++-------
 src/engines/cuda/test.cu                   |  2 +-
 src/lib/offloadtask.cc                     | 36 +++++++++++-----------
 tests/test_cuda.cc                         |  2 +-
 8 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/elements/ip/IPlookup_kernel.cu b/elements/ip/IPlookup_kernel.cu
index 507a3dd..c2af47e 100644
--- a/elements/ip/IPlookup_kernel.cu
+++ b/elements/ip/IPlookup_kernel.cu
@@ -44,8 +44,8 @@ __global__ void ipv4_route_lookup_cuda(
         uint16_t item_idx = item_ids[idx];
         struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
         struct datablock_kernel_arg *db_results    = datablocks[dbid_ipv4_lookup_results_d];
-        uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
-        uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases_out)[item_idx];
+        uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases)[item_idx];
 
         if (daddr == IGNORED_IP) {
             *lookup_result = 0;
diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu
index 712e80b..687bfcc 100644
--- a/elements/ipsec/IPsecAES_kernel.cu
+++ b/elements/ipsec/IPsecAES_kernel.cu
@@ -702,19 +702,19 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
     const struct datablock_kernel_arg *const db_iv              = datablocks[dbid_iv_d];
     const struct datablock_kernel_arg *const db_aes_block_info  = datablocks[dbid_aes_block_info_d];
 
-    assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);
+    assert(item_idx < db_aes_block_info->batches[batch_idx].item_count);
 
     uint64_t flow_id = 65536;
     const struct aes_block_info &cur_block_info = ((struct aes_block_info *)
-                                                   db_aes_block_info->batches[batch_idx].buffer_bases_in)
+                                                   db_aes_block_info->batches[batch_idx].buffer_bases)
                                                   [item_idx];
     const int pkt_idx         = cur_block_info.pkt_idx;
     const int block_idx_local = cur_block_info.block_idx;
-    const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value<uintptr_t>();
-    const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
+    const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets[pkt_idx].as_value<uintptr_t>();
+    const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes[pkt_idx];
 
     if (cur_block_info.magic == 85739 && pkt_idx < 64 && length != 0) {
-        flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
+        flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases)[pkt_idx];
         if (flow_id != 65536)
             assert(flow_id < 1024);
     }
@@ -744,9 +744,9 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
         assert(pkt_idx < 64);
 
         const uint8_t *const aes_key = flows[flow_id].aes_key;
-        uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases_in
+        uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases
                        + (uintptr_t) (16 * pkt_idx));
-        const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;
+        const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases) + offset;
         uint4 ecounter = {0,0,0,0};
 
         assert(enc_payload != NULL);
diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
index 5c7311c..1cc36f0 100644
--- a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
+++ b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1250,11 +1250,11 @@ __global__ void computeHMAC_SHA1_3(
         const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
         const struct datablock_kernel_arg *db_flow_ids     = datablocks[dbid_flow_ids_d];
 
-        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
-        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
+        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases;
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets[item_idx].as_value<uintptr_t>();
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes[item_idx];
         if (enc_payload_base != NULL && length != 0) {
-            const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
+            const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases)[item_idx];
             if (flow_id != 65536) {
                 assert(flow_id < 1024);
                 const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;
diff --git a/elements/ipv6/LookupIP6Route_kernel.cu b/elements/ipv6/LookupIP6Route_kernel.cu
index bcbe31a..ec56829 100644
--- a/elements/ipv6/LookupIP6Route_kernel.cu
+++ b/elements/ipv6/LookupIP6Route_kernel.cu
@@ -156,8 +156,8 @@ __global__ void ipv6_route_lookup_cuda(
         uint16_t item_idx  = item_ids[idx];
         struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
         struct datablock_kernel_arg *db_results    = datablocks[dbid_ipv6_lookup_results_d];
-        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];
+        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases)[item_idx];
 
         // NOTE: On FERMI devices, using shared memory to store just 128
         //   pointers is not necessary since they have on-chip L1
diff --git a/include/nba/framework/datablock_shared.hh b/include/nba/framework/datablock_shared.hh
index 0e97fb0..032ee11 100644
--- a/include/nba/framework/datablock_shared.hh
+++ b/include/nba/framework/datablock_shared.hh
@@ -10,21 +10,15 @@
 #include <nba/core/shiftedint.hh>
 
 struct alignas(8) datablock_batch_info {
-    void *buffer_bases_in;
-    void *buffer_bases_out;
-    uint32_t item_count_in;
-    uint32_t item_count_out;
-    uint16_t *item_sizes_in;
-    uint16_t *item_sizes_out;
-    nba::dev_offset_t *item_offsets_in;
-    nba::dev_offset_t *item_offsets_out;
+    void *buffer_bases;
+    uint32_t item_count;
+    uint16_t *item_sizes;
+    nba::dev_offset_t *item_offsets;
 };
 
 struct alignas(8) datablock_kernel_arg {
-    uint32_t total_item_count_in;
-    uint32_t total_item_count_out;
-    uint16_t item_size_in;  // for fixed-size cases
-    uint16_t item_size_out; // for fixed-size cases
+    uint32_t total_item_count;
+    uint16_t item_size;  // for fixed-size cases
     struct datablock_batch_info batches[0];
 };
 
diff --git a/src/engines/cuda/test.cu b/src/engines/cuda/test.cu
index 33b437c..b89db61 100644
--- a/src/engines/cuda/test.cu
+++ b/src/engines/cuda/test.cu
@@ -29,7 +29,7 @@ __global__ void dbarg_size_check(size_t *sizes, size_t *offsets)
     sizes[0] = sizeof(struct datablock_kernel_arg);
     offsets[0] = offsetof(struct datablock_kernel_arg, batches);
     sizes[1] = sizeof(struct datablock_batch_info);
-    offsets[1] = offsetof(struct datablock_batch_info, item_offsets_in);
+    offsets[1] = offsetof(struct datablock_batch_info, item_offsets);
 }
 
 void *nba::get_test_kernel_shiftedint_size_check()
diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
index b71fb1e..beed60c 100644
--- a/src/lib/offloadtask.cc
+++ b/src/lib/offloadtask.cc
@@ -215,8 +215,7 @@ bool OffloadTask::copy_h2d()
         cctx->alloc_input_buffer(io_base, dbarg_size, dbarg_h, dbarg_d);
         dbarg = (struct datablock_kernel_arg *) cctx->unwrap_host_buffer(dbarg_h);
         dbarray[dbid_d] = (struct datablock_kernel_arg *) cctx->unwrap_device_buffer(dbarg_d);
-        dbarg->total_item_count_in  = 0;
-        dbarg->total_item_count_out = 0;
+        dbarg->total_item_count  = 0;
 
         // NOTE: To use our "datablock kernel arg" data structures,
         //       the underlying kernel language must support generic
@@ -234,29 +233,30 @@ bool OffloadTask::copy_h2d()
                  * have different lengths. */
                 //assert(t->aligned_item_sizes_h != nullptr);
                 uintptr_t base_ptr = (uintptr_t) cctx->unwrap_device_buffer(t->aligned_item_sizes_d);
-                dbarg->batches[b].item_sizes_in  = (uint16_t *)
+                dbarg->batches[b].item_sizes  = (uint16_t *)
                         (base_ptr + offsetof(struct item_size_info, sizes));
-                dbarg->batches[b].item_sizes_out = (uint16_t *)
-                        (base_ptr + offsetof(struct item_size_info, sizes));
-                dbarg->batches[b].item_offsets_in = (dev_offset_t *)
-                        (base_ptr + offsetof(struct item_size_info, offsets));
-                dbarg->batches[b].item_offsets_out = (dev_offset_t *)
+                dbarg->batches[b].item_offsets = (dev_offset_t *)
                         (base_ptr + offsetof(struct item_size_info, offsets));
             } else {
                 /* Same for all batches.
                  * We assume the module developer knows the fixed length
                  * when writing device kernel codes. */
-                dbarg->item_size_in  = rri.length;
-                dbarg->item_size_out = wri.length;
-                dbarg->batches[b].item_offsets_in  = nullptr;
-                dbarg->batches[b].item_offsets_out = nullptr;
+                if (rri.type != READ_NONE)
+                    dbarg->item_size  = rri.length;
+                if (wri.type != WRITE_NONE)
+                    dbarg->item_size = wri.length;
+                dbarg->batches[b].item_offsets  = nullptr;
+            }
+            if (rri.type != READ_NONE) {
+                dbarg->batches[b].buffer_bases = cctx->unwrap_device_buffer(t->dev_in_ptr);
+                dbarg->batches[b].item_count   = t->in_count;
+                dbarg->total_item_count       += t->in_count;
+            }
+            if (wri.type != WRITE_NONE) {
+                dbarg->batches[b].buffer_bases = cctx->unwrap_device_buffer(t->dev_out_ptr);
+                dbarg->batches[b].item_count   = t->out_count;
+                dbarg->total_item_count       += t->out_count;
             }
-            dbarg->batches[b].buffer_bases_in = cctx->unwrap_device_buffer(t->dev_in_ptr);
-            dbarg->batches[b].item_count_in   = t->in_count;
-            dbarg->total_item_count_in       += t->in_count;
-            dbarg->batches[b].buffer_bases_out = cctx->unwrap_device_buffer(t->dev_out_ptr);
-            dbarg->batches[b].item_count_out   = t->out_count;
-            dbarg->total_item_count_out       += t->out_count;
         } /* endfor(batches) */
     } /* endfor(dbid) */
     return true;
diff --git a/tests/test_cuda.cc b/tests/test_cuda.cc
index b8e7f66..35396a6 100644
--- a/tests/test_cuda.cc
+++ b/tests/test_cuda.cc
@@ -103,7 +103,7 @@ TEST_P(CUDAStructTest, DatablockArgSizeAlignCheck) {
     EXPECT_EQ(sizeof(struct datablock_kernel_arg), output_sizes_h[0]);
     EXPECT_EQ(offsetof(struct datablock_kernel_arg, batches), output_offsets_h[0]);
     EXPECT_EQ(sizeof(struct datablock_batch_info), output_sizes_h[1]);
-    EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets_in), output_offsets_h[1]);
+    EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets), output_offsets_h[1]);
     ASSERT_EQ(cudaSuccess, cudaFree(output_sizes_d));
     ASSERT_EQ(cudaSuccess, cudaFree(output_offsets_d));
     ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());