refs #6: Optimize again and fix integer out-of-range bug
 * Removes the "expect_false" branch hint on EVRUN_NOWAIT in libev's
   ev_run() code.  This cuts the CPU cycles used by ev_run() in half!
   (Still, our performance bottleneck is not libev itself.  When we
   observe a high CPU percentage in libev, it means the CPU is wasting
   its cycles.)

 * Reduces NBA_MAX_IO_BASES from 17 to 1.
   This shrinks the memory area used by both the CPU and the GPU.
   It means that we now have bottlenecks in the memory/cache subsystems.

   - Adds a blocking ev_run() call to wait for io_bases to become
     available, using the same technique as waiting on batch pools:
     paired ev_run() <=> ev_break() calls (see the sketch below this
     item).

   - This improves performance.
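
   For reference, the ev_run()/ev_break() wait pattern looks roughly
   like the following.  This is a minimal sketch in plain C, not the
   actual NBA code; the names io_base_avail, on_io_base_returned(), and
   wait_io_base_available() are made up for illustration:

       #include <ev.h>

       static struct ev_loop *loop;    /* per-worker event loop */
       static int io_base_avail = 0;   /* free io_base slots */

       /* Async watcher callback, fired when an offload task returns
        * its io_base to the pool. */
       static void on_io_base_returned(struct ev_loop *l, ev_async *w, int revents)
       {
           io_base_avail++;
           ev_break(l, EVBREAK_ALL);   /* wake up the waiter below */
       }

       /* Blocking wait: spin the event loop until a slot frees up.
        * ev_run() sleeps inside the loop; ev_break() wakes it. */
       static void wait_io_base_available(void)
       {
           while (io_base_avail == 0)
               ev_run(loop, 0);
           io_base_avail--;
       }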

 * Widens offset fields from uint16_t to uint32_t for running IPsec
   with MTU-sized packets, where aggregated offsets may exceed 65535.
   (For example, 64 packets x ~1500-byte payloads already push the last
   offset near 96000.)  This has been the main cause of the frequent
   errors when running IPsec with large packets (>= 512 bytes).

   - This decreases performance.

 => The performance improvement and degradation above compensate for
    each other, so there is no net performance change compared to the
    previous commit.

 * Reduces the memory footprint by using a variable-sized array for
   datablock arguments (see the sketch after this list).  However, this
   does not yield significant performance changes because we already
   have "full" aggregated batches when offloading IPsec encryption, due
   to computation/memory bottlenecks.
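
   The "variable-sized array" is C's zero-length/flexible array member
   idiom, visible in the new struct datablock_kernel_arg in the diff
   below.  A minimal allocation sketch, assuming a hypothetical helper
   alloc_dbarg() and batch count nbatches known when the offload task
   is built (the actual NBA allocation path differs):

       #include <stdint.h>
       #include <stdlib.h>

       struct datablock_batch_info {       /* per-batch, as in the diff */
           void *buffer_bases_in;
           void *buffer_bases_out;
           uint32_t item_count_in;
           uint32_t item_count_out;
           uint16_t *item_sizes_in;
           uint16_t *item_sizes_out;
           uint32_t *item_offsets_in;
           uint32_t *item_offsets_out;
       };

       struct datablock_kernel_arg {
           uint32_t total_item_count_in;
           uint32_t total_item_count_out;
           uint16_t item_size_in;          /* for fixed-size cases */
           uint16_t item_size_out;         /* for fixed-size cases */
           struct datablock_batch_info batches[0];  /* variable-sized tail (GNU C) */
       };

       /* Allocate only the batch slots actually used, instead of a
        * fixed NBA_MAX_COPROC_PPDEPTH-sized array per field. */
       static struct datablock_kernel_arg *alloc_dbarg(uint32_t nbatches)
       {
           return (struct datablock_kernel_arg *)
               malloc(sizeof(struct datablock_kernel_arg)
                      + nbatches * sizeof(struct datablock_batch_info));
       }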
achimnol committed Jan 18, 2016
1 parent d81c5f9 commit 17e0217
Showing 16 changed files with 165 additions and 123 deletions.
3 changes: 2 additions & 1 deletion 3rdparty/libev/ev.c
@@ -3604,7 +3604,8 @@ ev_run (EV_P_ int flags)

       ECB_MEMORY_FENCE; /* make sure pipe_write_wanted is visible before we check for potential skips */

-      if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+      //if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+      if (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped))
        {
          waittime = MAX_BLOCKTIME;

10 changes: 5 additions & 5 deletions elements/ip/IPlookup_kernel.cu
@@ -32,7 +32,7 @@ __device__ uint32_t ntohl(uint32_t n)

 /* The GPU kernel. */
 __global__ void ipv4_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         uint16_t* __restrict__ TBL24_d,
@@ -43,10 +43,10 @@ __global__ void ipv4_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv4_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv4_lookup_results_d];
-        uint32_t daddr = ((uint32_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d];
+        uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];

         if (daddr == IGNORED_IP) {
             *lookup_result = 0;

27 changes: 14 additions & 13 deletions elements/ipsec/IPsecAES_kernel.cu
@@ -680,7 +680,7 @@ __device__ void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,
 }

 __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct aes_sa_entry* flow_info
@@ -698,25 +698,25 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];

-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_iv = &datablocks[dbid_iv_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
-        const struct datablock_kernel_arg *db_aes_block_info = &datablocks[dbid_aes_block_info_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_iv = datablocks[dbid_iv_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_aes_block_info = datablocks[dbid_aes_block_info_d];

         assert(batch_idx < 32);
-        assert(item_idx < db_aes_block_info->item_count_in[batch_idx]);
+        assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);

         uint64_t flow_id = 65536;
         const struct aes_block_info cur_block_info = ((struct aes_block_info *)
-                                                      db_aes_block_info->buffer_bases_in[batch_idx])
+                                                      db_aes_block_info->batches[batch_idx].buffer_bases_in)
                                                      [item_idx];
         const int pkt_idx = cur_block_info.pkt_idx;
         const int block_idx_local = cur_block_info.block_idx;
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][pkt_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][pkt_idx];
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];

         if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
-            flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[pkt_idx];
+            flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
             if (flow_id != 65536)
                 assert(flow_id < 1024);
         }
@@ -730,11 +730,11 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
     if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64) {

         aes_key = flow_info[flow_id].aes_key;
-        iv = ((uint4 *) db_iv->buffer_bases_in[batch_idx])[pkt_idx];
+        iv = ((uint4 *) db_iv->batches[batch_idx].buffer_bases_in)[pkt_idx];

         if (offset != 0 && length != 0) {

-            enc_payload = ((uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx]) + offset;
+            enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;

             /* Step 2. (marginal) */
             for (int i = 0; i * blockDim.x < 256; i++) {
@@ -773,7 +773,8 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(

     /* Step 5: XOR the plain text (in-place). */
     uint4 *in_blk = (uint4 *) &enc_payload[block_idx_local * AES_BLOCK_SIZE];
-    assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= enc_payload + db_enc_payloads->item_sizes_in[batch_idx][pkt_idx]);
+    assert((uint8_t*)in_blk + AES_BLOCK_SIZE <=
+           enc_payload + db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]);
     (*in_blk).x = ecounter.x ^ (*in_blk).x;
     (*in_blk).y = ecounter.y ^ (*in_blk).y;
     (*in_blk).z = ecounter.z ^ (*in_blk).z;

14 changes: 7 additions & 7 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1238,7 +1238,7 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,
 #endif

 __global__ void computeHMAC_SHA1_3(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct hmac_sa_entry *hmac_key_array)
@@ -1248,14 +1248,14 @@ __global__ void computeHMAC_SHA1_3(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];
         assert(item_idx < 64);
-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];

-        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx];
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][item_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][item_idx];
+        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
         if (enc_payload_base != NULL && offset != 0 && length != 0) {
-            const uint64_t flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[item_idx];
+            const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
             if (flow_id != 65536 && flow_id < 1024) {
                 //assert(flow_id < 1024);
                 const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;

10 changes: 5 additions & 5 deletions elements/ipv6/LookupIP6Route_kernel.cu
@@ -143,7 +143,7 @@ __device__ uint64_t ntohll(uint64_t val)
 }

 __global__ void ipv6_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         Item** __restrict__ tables_d,
@@ -154,10 +154,10 @@ __global__ void ipv6_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv6_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv6_lookup_results_d];
-        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
+        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];

         // NOTE: On FERMI devices, using shared memory to store just 128
         //       pointers is not necessary since they have on-chip L1

30 changes: 15 additions & 15 deletions include/nba/engines/cuda/compat.hh
@@ -9,23 +9,23 @@

 #include <nba/framework/config.hh>

+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-};
+    uint16_t item_size_in;   // for fixed-size cases
+    uint16_t item_size_out;  // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned

 #endif

2 changes: 1 addition & 1 deletion include/nba/framework/config.hh
@@ -47,7 +47,7 @@

 #define NBA_MAX_TASKPOOL_SIZE   (2048u)
 #define NBA_MAX_BATCHPOOL_SIZE  (2048u)
-#define NBA_MAX_IO_BASES    (17)
+#define NBA_MAX_IO_BASES    (1)

 #define NBA_MAX_ANNOTATION_SET_SIZE     (7)
 #define NBA_MAX_NODELOCALSTORAGE_ENTRIES (16)

56 changes: 33 additions & 23 deletions include/nba/framework/datablock.hh
@@ -79,22 +79,24 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
 };
 #else
 struct item_size_info {
     union {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
 };
 #endif

-/** Datablock tracking struct.
+/**
+ * Datablock tracking struct.
  *
- * It resides in PacketBatch as a static array, and keeps track of the
- * status of data blocks attached to the batch.
+ * It contains information required to reuse device buffers that are
+ * already copied to the device. It resides in PacketBatch as a static
+ * array.
  */
 struct datablock_tracker {
     void *host_in_ptr;
@@ -105,30 +107,38 @@ struct datablock_tracker {
     size_t in_count;
     size_t out_size;
     size_t out_count;
-    //struct item_size_info exact_item_sizes;
     struct item_size_info *aligned_item_sizes_h;
     memory_t aligned_item_sizes_d;
 };

-/* NOTE: The alignment of this struct should match with CUDA. */
+/**
+ * Datablock batch info struct.
+ *
+ * It contains item offset/size information for variable-length datablocks.
+ *
+ * NOTE: The alignment of this struct should match with CUDA.
+ */
+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
+/**
+ * Kernel argument spec for datablocks.
+ */
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-}; // __attribute__((aligned(8)));
+    uint16_t item_size_in;   // for fixed-size cases
+    uint16_t item_size_out;  // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned


 /** Datablock information class.
@@ -165,7 +175,7 @@ public:
     void preprocess(PacketBatch *batch, void *host_ptr);
     void postprocess(OffloadableElement *elem, int input_port, PacketBatch *batch, void *host_ptr);

-    /* Below methods arre used only when ROI type is USER_PREPROC/USER_POSTPROC. */
+    /* Below methods are used only when ROI type is USER_PREPROC/USER_POSTPROC. */
    virtual void calculate_read_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)
    { out_bytes = 0; out_count = 0; }
    virtual void calculate_write_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)

1 change: 1 addition & 0 deletions include/nba/framework/elementgraph.hh
@@ -46,6 +46,7 @@ public:
     /* Inserts the given batch/offloadtask to the internal task queue.
      * This does not execute the pipeline; call flush_tasks() for that. */
     void enqueue_batch(PacketBatch *batch, Element *start_elem, int input_port = 0);
     void enqueue_offload_task(OffloadTask *otask, OffloadableElement *start_elem, int input_port = 0);
+    void enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port = 0);

     /* Tries to run all pending computation tasks. */

5 changes: 2 additions & 3 deletions include/nba/framework/offloadtask.hh
@@ -58,8 +58,7 @@ public:
     void notify_completion();

 public:
-    /* Initialized during execute(). */
-    struct resource_param res;
+    struct resource_param res; /* Initialized during execute(). */
     uint64_t offload_start;
     double offload_cost;
     size_t num_pkts;
@@ -80,7 +79,7 @@ public:
     OffloadableElement* elem;
     int dbid_h2d[NBA_MAX_DATABLOCKS];

-    struct datablock_kernel_arg *dbarray_h;
+    struct datablock_kernel_arg **dbarray_h;
     memory_t dbarray_d;

     struct ev_async *completion_watcher __cache_aligned;

2 changes: 1 addition & 1 deletion src/engines/cuda/computecontext.cc
@@ -12,7 +12,7 @@ struct cuda_event_context {
     void *user_arg;
 };

-#define IO_BASE_SIZE (4 * 1024 * 1024)
+#define IO_BASE_SIZE (16 * 1024 * 1024)
 #undef USE_PHYS_CONT_MEMORY // performance degraded :(

 CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother)

2 changes: 1 addition & 1 deletion src/lib/coprocessor.cc
@@ -165,7 +165,7 @@ void *coproc_loop(void *arg)
     ctx->task_done_queue = new FixedRing<OffloadTask *, nullptr>(256, ctx->loc.node_id);

     /* Initialize the event loop. */
-    ctx->loop = ev_loop_new(EVFLAG_AUTO);
+    ctx->loop = ev_loop_new(EVFLAG_AUTO | EVFLAG_NOSIGMASK);
     ctx->loop_broken = false;
     ev_set_userdata(ctx->loop, ctx);

4 changes: 2 additions & 2 deletions src/lib/datablock.cc
@@ -181,8 +181,8 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) {
     case READ_PARTIAL_PACKET: {
         void *invalid_value = this->get_invalid_value();
         FOR_EACH_PACKET_ALL_PREFETCH(batch, 4u) {
-            size_t aligned_elemsz = t->aligned_item_sizes_h->size;
-            size_t offset = t->aligned_item_sizes_h->size * pkt_idx;
+            uint16_t aligned_elemsz = t->aligned_item_sizes_h->size;
+            uint32_t offset = t->aligned_item_sizes_h->size * pkt_idx;
             if (IS_PACKET_INVALID(batch, pkt_idx)) {
                 if (invalid_value != nullptr) {
                     rte_memcpy((char *) host_in_buffer + offset, invalid_value, aligned_elemsz);

(The remaining 3 changed files are not shown.)
