diff --git a/3rdparty/libev/ev.c b/3rdparty/libev/ev.c
index cf55835..3ad0e9b 100644
--- a/3rdparty/libev/ev.c
+++ b/3rdparty/libev/ev.c
@@ -3604,7 +3604,8 @@ ev_run (EV_P_ int flags)
 
         ECB_MEMORY_FENCE; /* make sure pipe_write_wanted is visible before we check for potential skips */
 
-        if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+        //if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+        if (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped))
           {
             waittime = MAX_BLOCKTIME;
 
diff --git a/elements/ip/IPlookup_kernel.cu b/elements/ip/IPlookup_kernel.cu
index dddd269..6e06333 100644
--- a/elements/ip/IPlookup_kernel.cu
+++ b/elements/ip/IPlookup_kernel.cu
@@ -32,7 +32,7 @@ __device__ uint32_t ntohl(uint32_t n)
 
 /* The GPU kernel. */
 __global__ void ipv4_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         uint16_t* __restrict__ TBL24_d,
@@ -43,10 +43,10 @@ __global__ void ipv4_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv4_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv4_lookup_results_d];
-        uint32_t daddr = ((uint32_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d];
+        uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *)db_results->batches[batch_idx].buffer_bases_out)[item_idx];
 
         if (daddr == IGNORED_IP) {
             *lookup_result = 0;
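Note: every offloaded kernel in this patch changes its first argument from a contiguous array of fixed-size datablock_kernel_arg structs to an array of device pointers, one per datablock, each pointing to a separately allocated struct whose trailing batches[] array carries the per-batch buffer bases and item metadata. The following is a minimal sketch of the resulting access pattern, not code from the patch; the kernel name, the dbid_example_d constant, and the element types are placeholders:

    #define dbid_example_d 0  /* placeholder datablock id */

    __global__ void example_lookup_kernel(
            struct datablock_kernel_arg **datablocks,
            uint32_t count, uint16_t *batch_ids, uint16_t *item_ids)
    {
        uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < count) {
            /* First level: the per-datablock argument struct. */
            struct datablock_kernel_arg *db = datablocks[dbid_example_d];
            /* Second level: this item's batch within that datablock. */
            const struct datablock_batch_info *b = &db->batches[batch_ids[idx]];
            uint32_t *in = (uint32_t *) b->buffer_bases_in;
            uint16_t *out = (uint16_t *) b->buffer_bases_out;
            out[item_ids[idx]] = (uint16_t) in[item_ids[idx]];
        }
    }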
diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu
index 9be6f10..5704f0a 100644
--- a/elements/ipsec/IPsecAES_kernel.cu
+++ b/elements/ipsec/IPsecAES_kernel.cu
@@ -680,7 +680,7 @@ __device__ void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,
 }
 
 __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct aes_sa_entry* flow_info
@@ -698,25 +698,25 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];
 
-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_iv = &datablocks[dbid_iv_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
-        const struct datablock_kernel_arg *db_aes_block_info = &datablocks[dbid_aes_block_info_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_iv = datablocks[dbid_iv_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_aes_block_info = datablocks[dbid_aes_block_info_d];
 
         assert(batch_idx < 32);
-        assert(item_idx < db_aes_block_info->item_count_in[batch_idx]);
+        assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);
 
         uint64_t flow_id = 65536;
         const struct aes_block_info cur_block_info = ((struct aes_block_info *)
-                                                      db_aes_block_info->buffer_bases_in[batch_idx])
+                                                      db_aes_block_info->batches[batch_idx].buffer_bases_in)
                                                      [item_idx];
         const int pkt_idx = cur_block_info.pkt_idx;
         const int block_idx_local = cur_block_info.block_idx;
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][pkt_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][pkt_idx];
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
 
         if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
-            flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[pkt_idx];
+            flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
             if (flow_id != 65536)
                 assert(flow_id < 1024);
         }
@@ -730,11 +730,11 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
 
         if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64) {
             aes_key = flow_info[flow_id].aes_key;
-            iv = ((uint4 *) db_iv->buffer_bases_in[batch_idx])[pkt_idx];
+            iv = ((uint4 *) db_iv->batches[batch_idx].buffer_bases_in)[pkt_idx];
 
             if (offset != 0 && length != 0) {
-                enc_payload = ((uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx]) + offset;
+                enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;
 
                 /* Step 2. (marginal) */
                 for (int i = 0; i * blockDim.x < 256; i++) {
@@ -773,7 +773,8 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
 
             /* Step 5: XOR the plain text (in-place). */
             uint4 *in_blk = (uint4 *) &enc_payload[block_idx_local * AES_BLOCK_SIZE];
-            assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= enc_payload + db_enc_payloads->item_sizes_in[batch_idx][pkt_idx]);
+            assert((uint8_t*)in_blk + AES_BLOCK_SIZE <=
+                   enc_payload + db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]);
             (*in_blk).x = ecounter.x ^ (*in_blk).x;
             (*in_blk).y = ecounter.y ^ (*in_blk).y;
             (*in_blk).z = ecounter.z ^ (*in_blk).z;
diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
index 93b08d1..d0d0e81 100644
--- a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
+++ b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1238,7 +1238,7 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,
 #endif
 
 __global__ void computeHMAC_SHA1_3(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct hmac_sa_entry *hmac_key_array)
@@ -1248,14 +1248,14 @@ __global__ void computeHMAC_SHA1_3(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];
         assert(item_idx < 64);
-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
 
-        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx];
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][item_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][item_idx];
+        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
         if (enc_payload_base != NULL && offset != 0 && length != 0) {
-            const uint64_t flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[item_idx];
+            const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
             if (flow_id != 65536 && flow_id < 1024) {
                 //assert(flow_id < 1024);
                 const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;
diff --git a/elements/ipv6/LookupIP6Route_kernel.cu b/elements/ipv6/LookupIP6Route_kernel.cu
index bc7fa00..098932c 100644
--- a/elements/ipv6/LookupIP6Route_kernel.cu
+++ b/elements/ipv6/LookupIP6Route_kernel.cu
@@ -143,7 +143,7 @@ __device__ uint64_t ntohll(uint64_t val)
 }
 
 __global__ void ipv6_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         Item** __restrict__ tables_d,
@@ -154,10 +154,10 @@ __global__ void ipv6_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv6_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv6_lookup_results_d];
-        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
+        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];
 
         // NOTE: On FERMI devices, using shared memory to store just 128
         //       pointers is not necessary since they have on-chip L1
diff --git a/include/nba/engines/cuda/compat.hh b/include/nba/engines/cuda/compat.hh
index 560cb71..3260658 100644
--- a/include/nba/engines/cuda/compat.hh
+++ b/include/nba/engines/cuda/compat.hh
@@ -9,23 +9,23 @@
 
 #include 
 
+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-};
+    uint16_t item_size_in; // for fixed-size cases
+    uint16_t item_size_out; // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned
 
 #endif
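Note: batches[0] at the end of datablock_kernel_arg is a zero-length (flexible) array member, so the struct is now variable-length and can no longer live in a plain array; each instance must be allocated with room for its per-batch entries, which is why kernels now receive an array of pointers instead. A minimal host-side sizing sketch under that assumption, with plain malloc standing in for the framework's pinned input-buffer allocator:

    #include <cstdlib>

    /* Allocate one kernel-argument struct plus `num_batches` trailing
     * datablock_batch_info entries in a single contiguous block. */
    static struct datablock_kernel_arg *alloc_dbarg(size_t num_batches)
    {
        size_t bytes = sizeof(struct datablock_kernel_arg)
                       + num_batches * sizeof(struct datablock_batch_info);
        return (struct datablock_kernel_arg *) std::malloc(bytes);
    }

This is the same sizing rule used by copy_h2d() in src/lib/offloadtask.cc below.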
diff --git a/include/nba/framework/config.hh b/include/nba/framework/config.hh
index 9f9f9ed..f64b0c3 100644
--- a/include/nba/framework/config.hh
+++ b/include/nba/framework/config.hh
@@ -47,7 +47,7 @@
 #define NBA_MAX_TASKPOOL_SIZE (2048u)
 #define NBA_MAX_BATCHPOOL_SIZE (2048u)
 
-#define NBA_MAX_IO_BASES (17)
+#define NBA_MAX_IO_BASES (1)
 
 #define NBA_MAX_ANNOTATION_SET_SIZE (7)
 #define NBA_MAX_NODELOCALSTORAGE_ENTRIES (16)
diff --git a/include/nba/framework/datablock.hh b/include/nba/framework/datablock.hh
index 13fbded..d554594 100644
--- a/include/nba/framework/datablock.hh
+++ b/include/nba/framework/datablock.hh
@@ -79,7 +79,7 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
 };
 #else
 struct item_size_info {
@@ -87,14 +87,16 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
 };
 #endif
 
-/** Datablock tracking struct.
+/**
+ * Datablock tracking struct.
  *
- * It resides in PacketBatch as a static array, and keeps track of the
- * status of data blocks attached to the batch.
+ * It contains information required to reuse device buffers that are
+ * already copied to the device. It resides in PacketBatch as a static
+ * array.
  */
 struct datablock_tracker {
     void *host_in_ptr;
@@ -105,30 +107,38 @@ struct datablock_tracker {
     size_t in_count;
     size_t out_size;
     size_t out_count;
-    //struct item_size_info exact_item_sizes;
     struct item_size_info *aligned_item_sizes_h;
     memory_t aligned_item_sizes_d;
 };
 
-/* NOTE: The alignment of this struct should match with CUDA. */
+/**
+ * Datablock batch info struct.
+ *
+ * It contains item offset/size information for variable-length datablocks.
+ *
+ * NOTE: The alignment of this struct should match with CUDA.
+ */
+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
+/**
+ * Kernel argument spec for datablocks.
+ */
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-}; // __attribute__((aligned(8)));
+    uint16_t item_size_in; // for fixed-size cases
+    uint16_t item_size_out; // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned
 
 
 /** Datablock information class.
@@ -165,7 +175,7 @@ public:
     void preprocess(PacketBatch *batch, void *host_ptr);
     void postprocess(OffloadableElement *elem, int input_port, PacketBatch *batch, void *host_ptr);
 
-    /* Below methods arre used only when ROI type is USER_PREPROC/USER_POSTPROC. */
+    /* Below methods are used only when ROI type is USER_PREPROC/USER_POSTPROC. */
     virtual void calculate_read_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)
     { out_bytes = 0; out_count = 0; }
     virtual void calculate_write_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)
diff --git a/include/nba/framework/elementgraph.hh b/include/nba/framework/elementgraph.hh
index 0be5a4c..ae06774 100644
--- a/include/nba/framework/elementgraph.hh
+++ b/include/nba/framework/elementgraph.hh
@@ -46,6 +46,7 @@ public:
     /* Inserts the given batch/offloadtask to the internal task queue.
      * This does not execute the pipeline; call flush_tasks() for that. */
     void enqueue_batch(PacketBatch *batch, Element *start_elem, int input_port = 0);
+    void enqueue_offload_task(OffloadTask *otask, OffloadableElement *start_elem, int input_port = 0);
     void enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port = 0);
 
     /* Tries to run all pending computation tasks. */
diff --git a/include/nba/framework/offloadtask.hh b/include/nba/framework/offloadtask.hh
index d2734b2..8f9d00a 100644
--- a/include/nba/framework/offloadtask.hh
+++ b/include/nba/framework/offloadtask.hh
@@ -58,8 +58,7 @@ public:
     void notify_completion();
 
 public:
-    /* Initialized during execute(). */
-    struct resource_param res;
+    struct resource_param res; /* Initialized during execute(). */
     uint64_t offload_start;
     double offload_cost;
     size_t num_pkts;
@@ -80,7 +79,7 @@ public:
    OffloadableElement* elem;
     int dbid_h2d[NBA_MAX_DATABLOCKS];
 
-    struct datablock_kernel_arg *dbarray_h;
+    struct datablock_kernel_arg **dbarray_h;
     memory_t dbarray_d;
 
     struct ev_async *completion_watcher __cache_aligned;
diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc
index 75fe6d1..0aa1ec0 100644
--- a/src/engines/cuda/computecontext.cc
+++ b/src/engines/cuda/computecontext.cc
@@ -12,7 +12,7 @@ struct cuda_event_context {
     void *user_arg;
 };
 
-#define IO_BASE_SIZE (4 * 1024 * 1024)
+#define IO_BASE_SIZE (16 * 1024 * 1024)
 #undef USE_PHYS_CONT_MEMORY // performance degraded :(
 
 CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother)
diff --git a/src/lib/coprocessor.cc b/src/lib/coprocessor.cc
index 295b0ea..9c35792 100644
--- a/src/lib/coprocessor.cc
+++ b/src/lib/coprocessor.cc
@@ -165,7 +165,7 @@ void *coproc_loop(void *arg)
     ctx->task_done_queue = new FixedRing(256, ctx->loc.node_id);
 
     /* Initialize the event loop. */
-    ctx->loop = ev_loop_new(EVFLAG_AUTO);
+    ctx->loop = ev_loop_new(EVFLAG_AUTO | EVFLAG_NOSIGMASK);
     ctx->loop_broken = false;
     ev_set_userdata(ctx->loop, ctx);
diff --git a/src/lib/datablock.cc b/src/lib/datablock.cc
index 515c29b..d789545 100644
--- a/src/lib/datablock.cc
+++ b/src/lib/datablock.cc
@@ -181,8 +181,8 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) {
     case READ_PARTIAL_PACKET: {
         void *invalid_value = this->get_invalid_value();
         FOR_EACH_PACKET_ALL_PREFETCH(batch, 4u) {
-            size_t aligned_elemsz = t->aligned_item_sizes_h->size;
-            size_t offset = t->aligned_item_sizes_h->size * pkt_idx;
+            uint16_t aligned_elemsz = t->aligned_item_sizes_h->size;
+            uint32_t offset = t->aligned_item_sizes_h->size * pkt_idx;
             if (IS_PACKET_INVALID(batch, pkt_idx)) {
                 if (invalid_value != nullptr) {
                     rte_memcpy((char *) host_in_buffer + offset, invalid_value, aligned_elemsz);
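Note: the offset fields (and the local offset variable above) widen from uint16_t to uint32_t because per-item byte offsets within a task's IO buffer can exceed 64 KiB once IO_BASE_SIZE grows to 16 MiB. A small self-contained illustration of the wrap-around the wider type avoids; the element size and packet index are made-up values:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint16_t aligned_elemsz = 192;   /* hypothetical aligned item size */
        uint32_t pkt_idx = 400;          /* hypothetical packet index */
        uint16_t off16 = (uint16_t) (aligned_elemsz * pkt_idx);  /* 76800 wraps to 11264 */
        uint32_t off32 = (uint32_t) aligned_elemsz * pkt_idx;    /* stays 76800 */
        std::printf("off16=%u off32=%u\n", off16, off32);
        return 0;
    }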
diff --git a/src/lib/elementgraph.cc b/src/lib/elementgraph.cc
index efb938f..a132dcd 100644
--- a/src/lib/elementgraph.cc
+++ b/src/lib/elementgraph.cc
@@ -42,7 +42,8 @@ ElementGraph::ElementGraph(comp_thread_context *ctx)
 #ifdef NBA_REUSE_DATABLOCKS
     struct rte_hash_parameters hparams;
     char namebuf[RTE_HASH_NAMESIZE];
-    sprintf(namebuf, "elemgraph@%u.%u:offl_actions", ctx->loc.node_id, ctx->loc.local_thread_idx);
+    snprintf(namebuf, RTE_HASH_NAMESIZE, "elemgraph@%u.%u:offl_actions",
+             ctx->loc.node_id, ctx->loc.local_thread_idx);
     hparams.name = namebuf;
     hparams.entries = 64;
     hparams.key_len = sizeof(struct offload_action_key);
@@ -97,12 +98,16 @@ void ElementGraph::send_offload_task_to_device(OffloadTask *task)
             bidx ++;
         }
     }
-    task->offload_start = 0;
 
     /* Allocate the host-device IO buffer pool. */
-    if (task->io_base == INVALID_IO_BASE)
+    while (task->io_base == INVALID_IO_BASE) {
         task->io_base = cctx->alloc_io_base();
-    assert(task->io_base != INVALID_IO_BASE);
+        if (task->io_base == INVALID_IO_BASE) {
+            /* If not available now, wait. */
+            ev_run(ctx->io_ctx->loop, 0);
+        }
+        if (unlikely(ctx->io_ctx->loop_broken)) return;
+    }
 
     /* Calculate required buffer sizes, allocate them, and initialize them.
      * The mother buffer is statically allocated on start-up and here we
@@ -214,6 +219,15 @@ void ElementGraph::enqueue_batch(PacketBatch *batch, Element *start_elem, int in
     queue.push_back(Task::to_task(batch));
 }
 
+void ElementGraph::enqueue_offload_task(OffloadTask *otask, OffloadableElement *start_elem, int input_port)
+{
+    assert(start_elem != nullptr);
+    otask->elem = start_elem;
+    otask->tracker.element = start_elem;
+    otask->tracker.input_port = input_port;
+    queue.push_back(Task::to_task(otask));
+}
+
 void ElementGraph::enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port)
 {
     assert(start_elem != nullptr);
diff --git a/src/lib/io.cc b/src/lib/io.cc
index 233138c..a3d9098 100644
--- a/src/lib/io.cc
+++ b/src/lib/io.cc
@@ -134,6 +134,22 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
     /* Run postprocessing handlers. */
     task->postprocess();
 
+    if (ctx->elem_graph->check_postproc_all(task->elem)) {
+        /* Reset all datablock trackers. */
+        for (PacketBatch *batch : task->batches) {
+            if (batch->datablock_states != nullptr) {
+                struct datablock_tracker *t = batch->datablock_states;
+                t->host_in_ptr = nullptr;
+                t->host_out_ptr = nullptr;
+                rte_mempool_put(ctx->dbstate_pool, (void *) t);
+                batch->datablock_states = nullptr;
+            }
+        }
+        /* Release per-task io_base. */
+        task->cctx->clear_io_buffers(task->io_base);
+        ev_break(ctx->io_ctx->loop, EVBREAK_ALL);
+    }
+
     /* Update statistics. */
     uint64_t task_cycles = now - task->offload_start;
     float time_spent = (float) task_cycles / rte_get_tsc_hz();
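Note: with NBA_MAX_IO_BASES dropped to 1, a worker thread may find no free io_base for a new offload task. The loop added to send_offload_task_to_device() above blocks inside ev_run() until the completion callback in src/lib/io.cc releases the buffers and calls ev_break(). A stripped-down sketch of that wait/wake pattern; the flag and function names are illustrative and watcher registration is omitted:

    #include <ev.h>

    static bool io_base_free = false;

    /* Invoked from the event loop when an offload task completes. */
    static void completion_cb(struct ev_loop *loop, ev_async *w, int revents)
    {
        io_base_free = true;            /* release the io_base */
        ev_break(loop, EVBREAK_ALL);    /* wake the thread blocked in ev_run() */
    }

    /* Called by the sender when no io_base is currently available. */
    static void wait_for_io_base(struct ev_loop *loop)
    {
        while (!io_base_free)
            ev_run(loop, 0);            /* run the loop until completion_cb breaks it */
        io_base_free = false;           /* claim it for the new task */
    }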
diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
index 4ddaf04..9aa2f96 100644
--- a/src/lib/offloadtask.cc
+++ b/src/lib/offloadtask.cc
@@ -189,27 +189,34 @@ void OffloadTask::prepare_write_buffer()
 
 bool OffloadTask::copy_h2d()
 {
-    bool has_h2d_copies = false;
     state = TASK_H2D_COPYING;
 
     /* Copy the datablock information for the first kernel argument. */
-    size_t dbarray_size = ALIGN_CEIL(sizeof(struct datablock_kernel_arg) * datablocks.size(), CACHE_LINE_SIZE);
+    size_t dbarray_size = sizeof(struct datablock_kernel_arg *) * datablocks.size();
     cctx->alloc_input_buffer(io_base, dbarray_size, (void **) &dbarray_h, &dbarray_d);
     _debug_print_inb("copy_h2d.dbarray", nullptr, 0);
     assert(dbarray_h != nullptr);
 
     for (int dbid : datablocks) {
         int dbid_d = dbid_h2d[dbid];
-        dbarray_h[dbid_d].total_item_count_in = 0;
-        dbarray_h[dbid_d].total_item_count_out = 0;
         assert(dbid_d < (signed) datablocks.size());
-
         DataBlock *db = comp_ctx->datablock_registry[dbid];
         struct read_roi_info rri;
         struct write_roi_info wri;
         db->get_read_roi(&rri);
         db->get_write_roi(&wri);
 
+        struct datablock_kernel_arg *dbarg_h;
+        memory_t dbarg_d;
+        size_t dbarg_size = sizeof(struct datablock_kernel_arg)
+                            + batches.size() * sizeof(struct datablock_batch_info);
+        cctx->alloc_input_buffer(io_base, dbarg_size, (void **) &dbarg_h, &dbarg_d);
+        assert(dbarg_h != nullptr);
+
+        dbarray_h[dbid_d] = (struct datablock_kernel_arg *) dbarg_d.ptr;
+        dbarg_h->total_item_count_in = 0;
+        dbarg_h->total_item_count_out = 0;
+
         int b = 0;
         for (PacketBatch *batch : batches) {
             assert(batch->datablock_states != nullptr);
@@ -219,34 +226,37 @@ bool OffloadTask::copy_h2d()
                 /* We need to copy the size array because each item may
                  * have different lengths. */
                 assert(t->aligned_item_sizes_h != nullptr);
-                dbarray_h[dbid_d].item_sizes_in[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                     + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarray_h[dbid_d].item_sizes_out[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                      + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarray_h[dbid_d].item_offsets_in[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                       + (uintptr_t) offsetof(struct item_size_info, offsets));
-                dbarray_h[dbid_d].item_offsets_out[b] = (uint16_t *) ((char *) t->aligned_item_sizes_d.ptr
-                                                        + (uintptr_t) offsetof(struct item_size_info, offsets));
+                dbarg_h->batches[b].item_sizes_in = (uint16_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, sizes));
+                dbarg_h->batches[b].item_sizes_out = (uint16_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, sizes));
+                dbarg_h->batches[b].item_offsets_in = (uint32_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, offsets));
+                dbarg_h->batches[b].item_offsets_out = (uint32_t *)
+                        ((char *) t->aligned_item_sizes_d.ptr
+                         + (uintptr_t) offsetof(struct item_size_info, offsets));
             } else {
                 /* Same for all batches.
                  * We assume the module developer knows the fixed length
                  * when writing device kernel codes. */
-                dbarray_h[dbid_d].item_size_in = rri.length;
-                dbarray_h[dbid_d].item_size_out = wri.length;
-                dbarray_h[dbid_d].item_offsets_in[b] = nullptr;
-                dbarray_h[dbid_d].item_offsets_out[b] = nullptr;
+                dbarg_h->item_size_in = rri.length;
+                dbarg_h->item_size_out = wri.length;
+                dbarg_h->batches[b].item_offsets_in = nullptr;
+                dbarg_h->batches[b].item_offsets_out = nullptr;
             }
-            dbarray_h[dbid_d].buffer_bases_in[b] = t->dev_in_ptr.ptr; // FIXME: generalize to CL?
-            dbarray_h[dbid_d].item_count_in[b] = t->in_count;
-            dbarray_h[dbid_d].total_item_count_in += t->in_count;
-            dbarray_h[dbid_d].buffer_bases_out[b] = t->dev_out_ptr.ptr; // FIXME: generalize to CL?
-            dbarray_h[dbid_d].item_count_out[b] = t->out_count;
-            dbarray_h[dbid_d].total_item_count_out += t->out_count;
+            dbarg_h->batches[b].buffer_bases_in = t->dev_in_ptr.ptr; // FIXME: generalize to CL?
+            dbarg_h->batches[b].item_count_in = t->in_count;
+            dbarg_h->total_item_count_in += t->in_count;
+            dbarg_h->batches[b].buffer_bases_out = t->dev_out_ptr.ptr; // FIXME: generalize to CL?
+            dbarg_h->batches[b].item_count_out = t->out_count;
+            dbarg_h->total_item_count_out += t->out_count;
             b++;
         } /* endfor(batches) */
     } /* endfor(dbid) */
-    has_h2d_copies = true;
-    return has_h2d_copies;
+    return true;
 }
 
 /**
@@ -303,7 +313,13 @@ void OffloadTask::execute()
     }
 
     size_t last_alloc_size = cctx->get_input_size(io_base);
-    cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0, last_alloc_size - input_alloc_size_begin);
+    //printf("GPU-offload-h2d-size: %'lu bytes\n", last_alloc_size);
+    // ipv4@64B: 16K ~ 24K
+    // ipsec@64B: ~ 5M
+    cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0,
+                              last_alloc_size - input_alloc_size_begin);
+    //cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0,
+    //                          2097152);
 
     cctx->clear_checkbits();
     cctx->clear_kernel_args();
@@ -403,22 +419,6 @@ void OffloadTask::postprocess()
             }
         }
         } /* endif(check_postproc) */
     } /* endfor(dbid) */
-
-    if (elemgraph->check_postproc_all(elem)) {
-        /* Reset all datablock trackers. */
-        for (PacketBatch *batch : batches) {
-            if (batch->datablock_states != nullptr) {
-                struct datablock_tracker *t = batch->datablock_states;
-                t->host_in_ptr = nullptr;
-                t->host_out_ptr = nullptr;
-                rte_mempool_put(comp_ctx->dbstate_pool, (void *) t);
-                batch->datablock_states = nullptr;
-            }
-        }
-        /* Release per-task io_base. */
-        cctx->clear_io_buffers(io_base);
-        //printf("%s task finished\n", elem->class_name());
-    }
 }
 // vim: ts=8 sts=4 sw=4 et