diff --git a/3rdparty/libev/ev.c b/3rdparty/libev/ev.c
index cf55835..3ad0e9b 100644
--- a/3rdparty/libev/ev.c
+++ b/3rdparty/libev/ev.c
@@ -3604,7 +3604,8 @@ ev_run (EV_P_ int flags)
 
         ECB_MEMORY_FENCE; /* make sure pipe_write_wanted is visible before we check for potential skips */
 
-        if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+        //if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+        if (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped))
           {
             waittime = MAX_BLOCKTIME;
 
diff --git a/elements/ip/IPlookup_kernel.cu b/elements/ip/IPlookup_kernel.cu
index dddd269..6e06333 100644
--- a/elements/ip/IPlookup_kernel.cu
+++ b/elements/ip/IPlookup_kernel.cu
@@ -32,7 +32,7 @@ __device__ uint32_t ntohl(uint32_t n)
 
 /* The GPU kernel. */
 __global__ void ipv4_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         uint16_t* __restrict__ TBL24_d,
@@ -43,10 +43,10 @@ __global__ void ipv4_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv4_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv4_lookup_results_d];
-        uint32_t daddr = ((uint32_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d];
+        uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases_out)[item_idx];
 
         if (daddr == IGNORED_IP) {
             *lookup_result = 0;
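Note: every offloaded kernel in this patch changes its first argument from a contiguous array of fixed-size datablock_kernel_arg structs to an array of device pointers, one per datablock, each pointing to a separately allocated struct whose trailing batches[] array carries the per-batch buffer bases and item metadata. The following is a minimal sketch of the resulting access pattern, not code from the patch; the kernel name, the dbid_example_d constant, and the element types are placeholders:

    #define dbid_example_d 0  /* placeholder datablock id */

    __global__ void example_lookup_kernel(
            struct datablock_kernel_arg **datablocks,
            uint32_t count, uint16_t *batch_ids, uint16_t *item_ids)
    {
        uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < count) {
            /* First level: the per-datablock argument struct. */
            struct datablock_kernel_arg *db = datablocks[dbid_example_d];
            /* Second level: this item's batch within that datablock. */
            const struct datablock_batch_info *b = &db->batches[batch_ids[idx]];
            uint32_t *in = (uint32_t *) b->buffer_bases_in;
            uint16_t *out = (uint16_t *) b->buffer_bases_out;
            out[item_ids[idx]] = (uint16_t) in[item_ids[idx]];
        }
    }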
diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu
index 9be6f10..5704f0a 100644
--- a/elements/ipsec/IPsecAES_kernel.cu
+++ b/elements/ipsec/IPsecAES_kernel.cu
@@ -680,7 +680,7 @@ __device__ void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,
 }
 
 __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct aes_sa_entry* flow_info
@@ -698,25 +698,25 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];
 
-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_iv = &datablocks[dbid_iv_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
-        const struct datablock_kernel_arg *db_aes_block_info = &datablocks[dbid_aes_block_info_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_iv = datablocks[dbid_iv_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_aes_block_info = datablocks[dbid_aes_block_info_d];
 
         assert(batch_idx < 32);
-        assert(item_idx < db_aes_block_info->item_count_in[batch_idx]);
+        assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);
 
         uint64_t flow_id = 65536;
         const struct aes_block_info cur_block_info = ((struct aes_block_info *)
-                                                      db_aes_block_info->buffer_bases_in[batch_idx])
+                                                      db_aes_block_info->batches[batch_idx].buffer_bases_in)
                                                      [item_idx];
         const int pkt_idx = cur_block_info.pkt_idx;
         const int block_idx_local = cur_block_info.block_idx;
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][pkt_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][pkt_idx];
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
 
         if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
-            flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[pkt_idx];
+            flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
             if (flow_id != 65536)
                 assert(flow_id < 1024);
         }
@@ -730,11 +730,11 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
 
         if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64) {
             aes_key = flow_info[flow_id].aes_key;
-            iv = ((uint4 *) db_iv->buffer_bases_in[batch_idx])[pkt_idx];
+            iv = ((uint4 *) db_iv->batches[batch_idx].buffer_bases_in)[pkt_idx];
 
             if (offset != 0 && length != 0) {
-                enc_payload = ((uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx]) + offset;
+                enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;
 
                 /* Step 2. (marginal) */
                 for (int i = 0; i * blockDim.x < 256; i++) {
@@ -773,7 +773,8 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
 
             /* Step 5: XOR the plain text (in-place). */
             uint4 *in_blk = (uint4 *) &enc_payload[block_idx_local * AES_BLOCK_SIZE];
-            assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= enc_payload + db_enc_payloads->item_sizes_in[batch_idx][pkt_idx]);
+            assert((uint8_t*)in_blk + AES_BLOCK_SIZE <=
+                   enc_payload + db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]);
             (*in_blk).x = ecounter.x ^ (*in_blk).x;
             (*in_blk).y = ecounter.y ^ (*in_blk).y;
             (*in_blk).z = ecounter.z ^ (*in_blk).z;
diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
index 93b08d1..d0d0e81 100644
--- a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
+++ b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1238,7 +1238,7 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,
 #endif
 
 __global__ void computeHMAC_SHA1_3(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct hmac_sa_entry *hmac_key_array)
@@ -1248,14 +1248,14 @@ __global__ void computeHMAC_SHA1_3(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];
         assert(item_idx < 64);
-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
 
-        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx];
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][item_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][item_idx];
+        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
         if (enc_payload_base != NULL && offset != 0 && length != 0) {
-            const uint64_t flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[item_idx];
+            const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
             if (flow_id != 65536 && flow_id < 1024) {
                 //assert(flow_id < 1024);
                 const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;
diff --git a/elements/ipv6/LookupIP6Route_kernel.cu b/elements/ipv6/LookupIP6Route_kernel.cu
index bc7fa00..098932c 100644
--- a/elements/ipv6/LookupIP6Route_kernel.cu
+++ b/elements/ipv6/LookupIP6Route_kernel.cu
@@ -143,7 +143,7 @@ __device__ uint64_t ntohll(uint64_t val)
 }
 
 __global__ void ipv6_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         Item** __restrict__ tables_d,
@@ -154,10 +154,10 @@ __global__ void ipv6_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv6_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv6_lookup_results_d];
-        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
+        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];
 
         // NOTE: On FERMI devices, using shared memory to store just 128
         //       pointers is not necessary since they have on-chip L1
diff --git a/include/nba/engines/cuda/compat.hh b/include/nba/engines/cuda/compat.hh
index 560cb71..3260658 100644
--- a/include/nba/engines/cuda/compat.hh
+++ b/include/nba/engines/cuda/compat.hh
@@ -9,23 +9,23 @@
 
 #include 
 
+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-};
+    uint16_t item_size_in; // for fixed-size cases
+    uint16_t item_size_out; // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned
 
 #endif
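Note: batches[0] at the end of datablock_kernel_arg is a zero-length (flexible) array member, so the struct is now variable-length and can no longer live in a plain array; each instance must be allocated with room for its per-batch entries, which is why kernels now receive an array of pointers instead. A minimal host-side sizing sketch under that assumption, with plain malloc standing in for the framework's pinned input-buffer allocator:

    #include <cstdlib>

    /* Allocate one kernel-argument struct plus `num_batches` trailing
     * datablock_batch_info entries in a single contiguous block. */
    static struct datablock_kernel_arg *alloc_dbarg(size_t num_batches)
    {
        size_t bytes = sizeof(struct datablock_kernel_arg)
                       + num_batches * sizeof(struct datablock_batch_info);
        return (struct datablock_kernel_arg *) std::malloc(bytes);
    }

This is the same sizing rule used by copy_h2d() in src/lib/offloadtask.cc below.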
diff --git a/include/nba/framework/config.hh b/include/nba/framework/config.hh
index 9f9f9ed..f64b0c3 100644
--- a/include/nba/framework/config.hh
+++ b/include/nba/framework/config.hh
@@ -47,7 +47,7 @@
 #define NBA_MAX_TASKPOOL_SIZE (2048u)
 #define NBA_MAX_BATCHPOOL_SIZE (2048u)
 
-#define NBA_MAX_IO_BASES (17)
+#define NBA_MAX_IO_BASES (1)
 
 #define NBA_MAX_ANNOTATION_SET_SIZE (7)
 #define NBA_MAX_NODELOCALSTORAGE_ENTRIES (16)
diff --git a/include/nba/framework/datablock.hh b/include/nba/framework/datablock.hh
index 13fbded..d554594 100644
--- a/include/nba/framework/datablock.hh
+++ b/include/nba/framework/datablock.hh
@@ -79,7 +79,7 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
 };
 #else
 struct item_size_info {
@@ -87,14 +87,16 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
 };
 #endif
 
-/** Datablock tracking struct.
+/**
+ * Datablock tracking struct.
  *
- * It resides in PacketBatch as a static array, and keeps track of the
- * status of data blocks attached to the batch.
+ * It contains information required to reuse device buffers that are
+ * already copied to the device. It resides in PacketBatch as a static
+ * array.
  */
 struct datablock_tracker {
     void *host_in_ptr;
@@ -105,30 +107,38 @@ struct datablock_tracker {
     size_t in_count;
     size_t out_size;
     size_t out_count;
-    //struct item_size_info exact_item_sizes;
     struct item_size_info *aligned_item_sizes_h;
     memory_t aligned_item_sizes_d;
 };
 
-/* NOTE: The alignment of this struct should match with CUDA. */
+/**
+ * Datablock batch info struct.
+ *
+ * It contains item offset/size information for variable-length datablocks.
+ *
+ * NOTE: The alignment of this struct should match with CUDA.
+ */
+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
+/**
+ * Kernel argument spec for datablocks.
+ */
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-}; // __attribute__((aligned(8)));
+    uint16_t item_size_in; // for fixed-size cases
+    uint16_t item_size_out; // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned
 
 
 /** Datablock information class.
@@ -165,7 +175,7 @@ public:
     void preprocess(PacketBatch *batch, void *host_ptr);
     void postprocess(OffloadableElement *elem, int input_port, PacketBatch *batch, void *host_ptr);
 
-    /* Below methods arre used only when ROI type is USER_PREPROC/USER_POSTPROC. */
+    /* Below methods are used only when ROI type is USER_PREPROC/USER_POSTPROC. */
     virtual void calculate_read_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)
     { out_bytes = 0; out_count = 0; }
     virtual void calculate_write_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)
diff --git a/include/nba/framework/elementgraph.hh b/include/nba/framework/elementgraph.hh
index 0be5a4c..ae06774 100644
--- a/include/nba/framework/elementgraph.hh
+++ b/include/nba/framework/elementgraph.hh
@@ -46,6 +46,7 @@ public:
     /* Inserts the given batch/offloadtask to the internal task queue.
      * This does not execute the pipeline; call flush_tasks() for that. */
     void enqueue_batch(PacketBatch *batch, Element *start_elem, int input_port = 0);
+    void enqueue_offload_task(OffloadTask *otask, OffloadableElement *start_elem, int input_port = 0);
     void enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port = 0);
 
     /* Tries to run all pending computation tasks. */
diff --git a/include/nba/framework/offloadtask.hh b/include/nba/framework/offloadtask.hh
index d2734b2..8f9d00a 100644
--- a/include/nba/framework/offloadtask.hh
+++ b/include/nba/framework/offloadtask.hh
@@ -58,8 +58,7 @@ public:
     void notify_completion();
 
 public:
-    /* Initialized during execute(). */
-    struct resource_param res;
+    struct resource_param res; /* Initialized during execute(). */
     uint64_t offload_start;
     double offload_cost;
     size_t num_pkts;
@@ -80,7 +79,7 @@ public:
    OffloadableElement* elem;
     int dbid_h2d[NBA_MAX_DATABLOCKS];
 
-    struct datablock_kernel_arg *dbarray_h;
+    struct datablock_kernel_arg **dbarray_h;
     memory_t dbarray_d;
 
     struct ev_async *completion_watcher __cache_aligned;
diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc
index 75fe6d1..0aa1ec0 100644
--- a/src/engines/cuda/computecontext.cc
+++ b/src/engines/cuda/computecontext.cc
@@ -12,7 +12,7 @@ struct cuda_event_context {
     void *user_arg;
 };
 
-#define IO_BASE_SIZE (4 * 1024 * 1024)
+#define IO_BASE_SIZE (16 * 1024 * 1024)
 #undef USE_PHYS_CONT_MEMORY // performance degraded :(
 
 CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother)
diff --git a/src/lib/coprocessor.cc b/src/lib/coprocessor.cc
index 295b0ea..9c35792 100644
--- a/src/lib/coprocessor.cc
+++ b/src/lib/coprocessor.cc
@@ -165,7 +165,7 @@ void *coproc_loop(void *arg)
     ctx->task_done_queue = new FixedRing(256, ctx->loc.node_id);
 
     /* Initialize the event loop. */
-    ctx->loop = ev_loop_new(EVFLAG_AUTO);
+    ctx->loop = ev_loop_new(EVFLAG_AUTO | EVFLAG_NOSIGMASK);
     ctx->loop_broken = false;
     ev_set_userdata(ctx->loop, ctx);
diff --git a/src/lib/datablock.cc b/src/lib/datablock.cc
index 515c29b..d789545 100644
--- a/src/lib/datablock.cc
+++ b/src/lib/datablock.cc
@@ -181,8 +181,8 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) {
     case READ_PARTIAL_PACKET: {
         void *invalid_value = this->get_invalid_value();
         FOR_EACH_PACKET_ALL_PREFETCH(batch, 4u) {
-            size_t aligned_elemsz = t->aligned_item_sizes_h->size;
-            size_t offset = t->aligned_item_sizes_h->size * pkt_idx;
+            uint16_t aligned_elemsz = t->aligned_item_sizes_h->size;
+            uint32_t offset = t->aligned_item_sizes_h->size * pkt_idx;
             if (IS_PACKET_INVALID(batch, pkt_idx)) {
                 if (invalid_value != nullptr) {
                     rte_memcpy((char *) host_in_buffer + offset, invalid_value, aligned_elemsz);
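Note: the offset fields (and the local offset variable above) widen from uint16_t to uint32_t because per-item byte offsets within a task's IO buffer can exceed 64 KiB once IO_BASE_SIZE grows to 16 MiB. A small self-contained illustration of the wrap-around the wider type avoids; the element size and packet index are made-up values:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint16_t aligned_elemsz = 192;   /* hypothetical aligned item size */
        uint32_t pkt_idx = 400;          /* hypothetical packet index */
        uint16_t off16 = (uint16_t) (aligned_elemsz * pkt_idx);  /* 76800 wraps to 11264 */
        uint32_t off32 = (uint32_t) aligned_elemsz * pkt_idx;    /* stays 76800 */
        std::printf("off16=%u off32=%u\n", off16, off32);
        return 0;
    }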
diff --git a/src/lib/elementgraph.cc b/src/lib/elementgraph.cc
index efb938f..a132dcd 100644
--- a/src/lib/elementgraph.cc
+++ b/src/lib/elementgraph.cc
@@ -42,7 +42,8 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
 #ifdef NBA_REUSE_DATABLOCKS
     struct rte_hash_parameters hparams;
     char namebuf[RTE_HASH_NAMESIZE];
-    sprintf(namebuf, "elemgraph@%u.%u:offl_actions", ctx->loc.node_id, ctx->loc.local_thread_idx);
+    snprintf(namebuf, RTE_HASH_NAMESIZE, "elemgraph@%u.%u:offl_actions",
+             ctx->loc.node_id, ctx->loc.local_thread_idx);
     hparams.name = namebuf;
     hparams.entries = 64;
     hparams.key_len = sizeof(struct offload_action_key);
@@ -97,12 +98,16 @@ void ElementGraph::send_offload_task_to_device(OffloadTask *task)
             bidx ++;
         }
     }
-    task->offload_start = 0;
 
     /* Allocate the host-device IO buffer pool. */
-    if (task->io_base == INVALID_IO_BASE)
+    while (task->io_base == INVALID_IO_BASE) {
         task->io_base = cctx->alloc_io_base();
-    assert(task->io_base != INVALID_IO_BASE);
+        if (task->io_base == INVALID_IO_BASE) {
+            /* If not available now, wait. */
+            ev_run(ctx->io_ctx->loop, 0);
+        }
+        if (unlikely(ctx->io_ctx->loop_broken)) return;
+    }
 
     /* Calculate required buffer sizes, allocate them, and initialize them.
      * The mother buffer is statically allocated on start-up and here we
@@ -214,6 +219,15 @@ void ElementGraph::enqueue_batch(PacketBatch *batch, Element *start_elem, int in
     queue.push_back(Task::to_task(batch));
 }
 
+void ElementGraph::enqueue_offload_task(OffloadTask *otask, OffloadableElement *start_elem, int input_port)
+{
+    assert(start_elem != nullptr);
+    otask->elem = start_elem;
+    otask->tracker.element = start_elem;
+    otask->tracker.input_port = input_port;
+    queue.push_back(Task::to_task(otask));
+}
+
 void ElementGraph::enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port)
 {
     assert(start_elem != nullptr);
diff --git a/src/lib/io.cc b/src/lib/io.cc
index 233138c..a3d9098 100644
--- a/src/lib/io.cc
+++ b/src/lib/io.cc
@@ -134,6 +134,22 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
     /* Run postprocessing handlers. */
     task->postprocess();
 
+    if (ctx->elem_graph->check_postproc_all(task->elem)) {
+        /* Reset all datablock trackers. */
+        for (PacketBatch *batch : task->batches) {
+            if (batch->datablock_states != nullptr) {
+                struct datablock_tracker *t = batch->datablock_states;
+                t->host_in_ptr = nullptr;
+                t->host_out_ptr = nullptr;
+                rte_mempool_put(ctx->dbstate_pool, (void *) t);
+                batch->datablock_states = nullptr;
+            }
+        }
+        /* Release per-task io_base. */
+        task->cctx->clear_io_buffers(task->io_base);
+        ev_break(ctx->io_ctx->loop, EVBREAK_ALL);
+    }
+
     /* Update statistics. */
     uint64_t task_cycles = now - task->offload_start;
     float time_spent = (float) task_cycles / rte_get_tsc_hz();
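Note: with NBA_MAX_IO_BASES dropped to 1, a worker thread may find no free io_base for a new offload task. The loop added to send_offload_task_to_device() above blocks inside ev_run() until the completion callback in src/lib/io.cc releases the buffers and calls ev_break(). A stripped-down sketch of that wait/wake pattern; the flag and function names are illustrative and watcher registration is omitted:

    #include <ev.h>

    static bool io_base_free = false;

    /* Invoked from the event loop when an offload task completes. */
    static void completion_cb(struct ev_loop *loop, ev_async *w, int revents)
    {
        io_base_free = true;            /* release the io_base */
        ev_break(loop, EVBREAK_ALL);    /* wake the thread blocked in ev_run() */
    }

    /* Called by the sender when no io_base is currently available. */
    static void wait_for_io_base(struct ev_loop *loop)
    {
        while (!io_base_free)
            ev_run(loop, 0);            /* run the loop until completion_cb breaks it */
        io_base_free = false;           /* claim it for the new task */
    }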
diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
index 4ddaf04..9aa2f96 100644
--- a/src/lib/offloadtask.cc
+++ b/src/lib/offloadtask.cc
@@ -189,27 +189,34 @@ void OffloadTask::prepare_write_buffer()
 
 bool OffloadTask::copy_h2d()
 {
-    bool has_h2d_copies = false;
     state = TASK_H2D_COPYING;
 
     /* Copy the datablock information for the first kernel argument. */
-    size_t dbarray_size = ALIGN_CEIL(sizeof(struct datablock_kernel_arg) * datablocks.size(), CACHE_LINE_SIZE);
+    size_t dbarray_size = sizeof(struct datablock_kernel_arg *) * datablocks.size();
     cctx->alloc_input_buffer(io_base, dbarray_size, (void **) &dbarray_h, &dbarray_d);
     _debug_print_inb("copy_h2d.dbarray", nullptr, 0);
     assert(dbarray_h != nullptr);
 
     for (int dbid : datablocks) {
         int dbid_d = dbid_h2d[dbid];
-        dbarray_h[dbid_d].total_item_count_in = 0;
-        dbarray_h[dbid_d].total_item_count_out = 0;
         assert(dbid_d < (signed) datablocks.size());
-
         DataBlock *db = comp_ctx->datablock_registry[dbid];
         struct read_roi_info rri;
         struct write_roi_info wri;
         db->get_read_roi(&rri);
         db->get_write_roi(&wri);
 
+        struct datablock_kernel_arg *dbarg_h;
+        memory_t dbarg_d;
+        size_t dbarg_size = sizeof(struct datablock_kernel_arg)
+                            + batches.size() * sizeof(struct datablock_batch_info);
+        cctx->alloc_input_buffer(io_base, dbarg_size, (void **) &dbarg_h, &dbarg_d);
+        assert(dbarg_h != nullptr);
+
+        dbarray_h[dbid_d] = (struct datablock_kernel_arg *) dbarg_d.ptr;
+        dbarg_h->total_item_count_in = 0;
+        dbarg_h->total_item_count_out = 0;
+
         int b = 0;
         for (PacketBatch *batch : batches) {
             assert(batch->datablock_states != nullptr);
@@ -219,34 +226,37 @@ bool OffloadTask::copy_h2d()
                 /* We need to copy the size array because each item may
                  * have different lengths. */
                 assert(t->aligned_item_sizes_h != nullptr);
-                dbarray_h[dbid_d].item_sizes_in[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                     + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarray_h[dbid_d].item_sizes_out[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                      + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarray_h[dbid_d].item_offsets_in[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                       + (uintptr_t) offsetof(struct item_size_info, offsets));
-                dbarray_h[dbid_d].item_offsets_out[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                        + (uintptr_t) offsetof(struct item_size_info, offsets));
+                dbarg_h->batches[b].item_sizes_in = (uint16_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, sizes));
+                dbarg_h->batches[b].item_sizes_out = (uint16_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, sizes));
+                dbarg_h->batches[b].item_offsets_in = (uint32_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, offsets));
+                dbarg_h->batches[b].item_offsets_out = (uint32_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, offsets));
             } else {
                 /* Same for all batches.
                  * We assume the module developer knows the fixed length
                  * when writing device kernel codes. */
-                dbarray_h[dbid_d].item_size_in = rri.length;
-                dbarray_h[dbid_d].item_size_out = wri.length;
-                dbarray_h[dbid_d].item_offsets_in[b] = nullptr;
-                dbarray_h[dbid_d].item_offsets_out[b] = nullptr;
+                dbarg_h->item_size_in = rri.length;
+                dbarg_h->item_size_out = wri.length;
+                dbarg_h->batches[b].item_offsets_in = nullptr;
+                dbarg_h->batches[b].item_offsets_out = nullptr;
             }
-            dbarray_h[dbid_d].buffer_bases_in[b] = t->dev_in_ptr.ptr; // FIXME: generalize to CL?
-            dbarray_h[dbid_d].item_count_in[b] = t->in_count;
-            dbarray_h[dbid_d].total_item_count_in += t->in_count;
-            dbarray_h[dbid_d].buffer_bases_out[b] = t->dev_out_ptr.ptr; // FIXME: generalize to CL?
-            dbarray_h[dbid_d].item_count_out[b] = t->out_count;
-            dbarray_h[dbid_d].total_item_count_out += t->out_count;
+            dbarg_h->batches[b].buffer_bases_in = t->dev_in_ptr.ptr; // FIXME: generalize to CL?
+            dbarg_h->batches[b].item_count_in = t->in_count;
+            dbarg_h->total_item_count_in += t->in_count;
+            dbarg_h->batches[b].buffer_bases_out = t->dev_out_ptr.ptr; // FIXME: generalize to CL?
+            dbarg_h->batches[b].item_count_out = t->out_count;
+            dbarg_h->total_item_count_out += t->out_count;
             b++;
         } /* endfor(batches) */
     } /* endfor(dbid) */
-    has_h2d_copies = true;
-    return has_h2d_copies;
+    return true;
 }
 
 /**
@@ -303,7 +313,13 @@ void OffloadTask::execute()
     }
 
     size_t last_alloc_size = cctx->get_input_size(io_base);
-    cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0, last_alloc_size - input_alloc_size_begin);
+    //printf("GPU-offload-h2d-size: %'lu bytes\n", last_alloc_size);
+    // ipv4@64B: 16K ~ 24K
+    // ipsec@64B: ~ 5M
+    cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0,
+                              last_alloc_size - input_alloc_size_begin);
+    //cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0,
+    //                          2097152);
 
     cctx->clear_checkbits();
     cctx->clear_kernel_args();
@@ -403,22 +419,6 @@ void OffloadTask::postprocess()
             }
         }
         } /* endif(check_postproc) */
     } /* endfor(dbid) */
-
-    if (elemgraph->check_postproc_all(elem)) {
-        /* Reset all datablock trackers. */
-        for (PacketBatch *batch : batches) {
-            if (batch->datablock_states != nullptr) {
-                struct datablock_tracker *t = batch->datablock_states;
-                t->host_in_ptr = nullptr;
-                t->host_out_ptr = nullptr;
-                rte_mempool_put(comp_ctx->dbstate_pool, (void *) t);
-                batch->datablock_states = nullptr;
-            }
-        }
-        /* Release per-task io_base. */
-        cctx->clear_io_buffers(io_base);
-        //printf("%s task finished\n", elem->class_name());
-    }
 }
 // vim: ts=8 sts=4 sw=4 et