refs #6: Optimize again and fix integer out-of-range bug
 * Removes the "expect_false" branch hint on EVRUN_NOWAIT in libev's
   ev_run() code.  This cuts the CPU cycles used by ev_run() in half!
   (Still, our performance bottleneck is not libev itself.  When we
   observe a high CPU percentage in libev, it means the CPU is wasting
   its cycles.)

 * Reduces NBA_MAX_IO_BASES from 17 to 1.
   This shrinks the memory area used by both the CPU and the GPU.
   It means that we now have bottlenecks in the memory/cache subsystems.

   - Adds a blocking ev_run() call to wait for io_bases to become
     available, using the same technique as waiting on batch pools:
     paired ev_run() <=> ev_break() calls (see the sketch below this
     item).

   - This improves performance.
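
   For reference, the ev_run()/ev_break() wait pattern looks roughly
   like the following.  This is a minimal sketch in plain C, not the
   actual NBA code; the names io_base_avail, on_io_base_returned(), and
   wait_io_base_available() are made up for illustration:

       #include <ev.h>

       static struct ev_loop *loop;    /* per-worker event loop */
       static int io_base_avail = 0;   /* free io_base slots */

       /* Async watcher callback, fired when an offload task returns
        * its io_base to the pool. */
       static void on_io_base_returned(struct ev_loop *l, ev_async *w, int revents)
       {
           io_base_avail++;
           ev_break(l, EVBREAK_ALL);   /* wake up the waiter below */
       }

       /* Blocking wait: spin the event loop until a slot frees up.
        * ev_run() sleeps inside the loop; ev_break() wakes it. */
       static void wait_io_base_available(void)
       {
           while (io_base_avail == 0)
               ev_run(loop, 0);
           io_base_avail--;
       }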

 * Widens offset fields from uint16_t to uint32_t for running IPsec
   with MTU-sized packets, where aggregated offsets may exceed 65535.
   (For example, 64 packets x ~1500-byte payloads already push the last
   offset near 96000.)  This has been the main cause of the frequent
   errors when running IPsec with large packets (>= 512 bytes).

   - This decreases performance.

 => The performance improvement and degradation above compensate for
    each other, so there is no net performance change compared to the
    previous commit.

 * Reduces the memory footprint by using a variable-sized array for
   datablock arguments (see the sketch after this list).  However, this
   does not yield significant performance changes because we already
   have "full" aggregated batches when offloading IPsec encryption, due
   to computation/memory bottlenecks.
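
   The "variable-sized array" is C's zero-length/flexible array member
   idiom, visible in the new struct datablock_kernel_arg in the diff
   below.  A minimal allocation sketch, assuming a hypothetical helper
   alloc_dbarg() and batch count nbatches known when the offload task
   is built (the actual NBA allocation path differs):

       #include <stdint.h>
       #include <stdlib.h>

       struct datablock_batch_info {       /* per-batch, as in the diff */
           void *buffer_bases_in;
           void *buffer_bases_out;
           uint32_t item_count_in;
           uint32_t item_count_out;
           uint16_t *item_sizes_in;
           uint16_t *item_sizes_out;
           uint32_t *item_offsets_in;
           uint32_t *item_offsets_out;
       };

       struct datablock_kernel_arg {
           uint32_t total_item_count_in;
           uint32_t total_item_count_out;
           uint16_t item_size_in;          /* for fixed-size cases */
           uint16_t item_size_out;         /* for fixed-size cases */
           struct datablock_batch_info batches[0];  /* variable-sized tail (GNU C) */
       };

       /* Allocate only the batch slots actually used, instead of a
        * fixed NBA_MAX_COPROC_PPDEPTH-sized array per field. */
       static struct datablock_kernel_arg *alloc_dbarg(uint32_t nbatches)
       {
           return (struct datablock_kernel_arg *)
               malloc(sizeof(struct datablock_kernel_arg)
                      + nbatches * sizeof(struct datablock_batch_info));
       }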
achimnol committed Jan 18, 2016
1 parent d81c5f9 commit 17e0217
Showing 16 changed files with 165 additions and 123 deletions.
3 changes: 2 additions & 1 deletion 3rdparty/libev/ev.c
@@ -3604,7 +3604,8 @@ ev_run (EV_P_ int flags)

       ECB_MEMORY_FENCE; /* make sure pipe_write_wanted is visible before we check for potential skips */

-      if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+      //if (expect_true (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped)))
+      if (!(flags & EVRUN_NOWAIT || idleall || !activecnt || pipe_write_skipped))
        {
          waittime = MAX_BLOCKTIME;

10 changes: 5 additions & 5 deletions elements/ip/IPlookup_kernel.cu
@@ -32,7 +32,7 @@ __device__ uint32_t ntohl(uint32_t n)

 /* The GPU kernel. */
 __global__ void ipv4_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         uint16_t* __restrict__ TBL24_d,
@@ -43,10 +43,10 @@ __global__ void ipv4_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv4_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv4_lookup_results_d];
-        uint32_t daddr = ((uint32_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv4_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv4_lookup_results_d];
+        uint32_t daddr = ((uint32_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];

         if (daddr == IGNORED_IP) {
             *lookup_result = 0;

27 changes: 14 additions & 13 deletions elements/ipsec/IPsecAES_kernel.cu
@@ -680,7 +680,7 @@ __device__ void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,
 }

 __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct aes_sa_entry* flow_info
@@ -698,25 +698,25 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];

-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_iv = &datablocks[dbid_iv_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
-        const struct datablock_kernel_arg *db_aes_block_info = &datablocks[dbid_aes_block_info_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_iv = datablocks[dbid_iv_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_aes_block_info = datablocks[dbid_aes_block_info_d];

         assert(batch_idx < 32);
-        assert(item_idx < db_aes_block_info->item_count_in[batch_idx]);
+        assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);

         uint64_t flow_id = 65536;
         const struct aes_block_info cur_block_info = ((struct aes_block_info *)
-                                                      db_aes_block_info->buffer_bases_in[batch_idx])
+                                                      db_aes_block_info->batches[batch_idx].buffer_bases_in)
                                                      [item_idx];
         const int pkt_idx = cur_block_info.pkt_idx;
         const int block_idx_local = cur_block_info.block_idx;
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][pkt_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][pkt_idx];
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];

         if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
-            flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[pkt_idx];
+            flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
             if (flow_id != 65536)
                 assert(flow_id < 1024);
         }
@@ -730,11 +730,11 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
     if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64) {

         aes_key = flow_info[flow_id].aes_key;
-        iv = ((uint4 *) db_iv->buffer_bases_in[batch_idx])[pkt_idx];
+        iv = ((uint4 *) db_iv->batches[batch_idx].buffer_bases_in)[pkt_idx];

         if (offset != 0 && length != 0) {

-            enc_payload = ((uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx]) + offset;
+            enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;

             /* Step 2. (marginal) */
             for (int i = 0; i * blockDim.x < 256; i++) {
@@ -773,7 +773,8 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(

     /* Step 5: XOR the plain text (in-place). */
     uint4 *in_blk = (uint4 *) &enc_payload[block_idx_local * AES_BLOCK_SIZE];
-    assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= enc_payload + db_enc_payloads->item_sizes_in[batch_idx][pkt_idx]);
+    assert((uint8_t*)in_blk + AES_BLOCK_SIZE <=
+           enc_payload + db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]);
     (*in_blk).x = ecounter.x ^ (*in_blk).x;
     (*in_blk).y = ecounter.y ^ (*in_blk).y;
     (*in_blk).z = ecounter.z ^ (*in_blk).z;

14 changes: 7 additions & 7 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1238,7 +1238,7 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,
 #endif

 __global__ void computeHMAC_SHA1_3(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         struct hmac_sa_entry *hmac_key_array)
@@ -1248,14 +1248,14 @@ __global__ void computeHMAC_SHA1_3(
         const uint16_t batch_idx = batch_ids[idx];
         const uint16_t item_idx = item_ids[idx];
         assert(item_idx < 64);
-        const struct datablock_kernel_arg *db_enc_payloads = &datablocks[dbid_enc_payloads_d];
-        const struct datablock_kernel_arg *db_flow_ids = &datablocks[dbid_flow_ids_d];
+        const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
+        const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];

-        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->buffer_bases_in[batch_idx];
-        const uintptr_t offset = (uintptr_t) db_enc_payloads->item_offsets_in[batch_idx][item_idx];
-        const uintptr_t length = (uintptr_t) db_enc_payloads->item_sizes_in[batch_idx][item_idx];
+        const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
+        const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx];
+        const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
         if (enc_payload_base != NULL && offset != 0 && length != 0) {
-            const uint64_t flow_id = ((uint64_t *) db_flow_ids->buffer_bases_in[batch_idx])[item_idx];
+            const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
             if (flow_id != 65536 && flow_id < 1024) {
                 //assert(flow_id < 1024);
                 const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;

10 changes: 5 additions & 5 deletions elements/ipv6/LookupIP6Route_kernel.cu
@@ -143,7 +143,7 @@ __device__ uint64_t ntohll(uint64_t val)
 }

 __global__ void ipv6_route_lookup_cuda(
-        struct datablock_kernel_arg *datablocks,
+        struct datablock_kernel_arg **datablocks,
         uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         Item** __restrict__ tables_d,
@@ -154,10 +154,10 @@ __global__ void ipv6_route_lookup_cuda(
     if (idx < count) {
         uint16_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
-        struct datablock_kernel_arg *db_dest_addrs = &datablocks[dbid_ipv6_dest_addrs_d];
-        struct datablock_kernel_arg *db_results = &datablocks[dbid_ipv6_lookup_results_d];
-        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->buffer_bases_in[batch_idx])[item_idx];
-        uint16_t *lookup_result = &((uint16_t *) db_results->buffer_bases_out[batch_idx])[item_idx];
+        struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
+        struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
+        struct _cu_uint128_t daddr = ((struct _cu_uint128_t*) db_dest_addrs->batches[batch_idx].buffer_bases_in)[item_idx];
+        uint16_t *lookup_result = &((uint16_t *) db_results->batches[batch_idx].buffer_bases_out)[item_idx];

         // NOTE: On FERMI devices, using shared memory to store just 128
         //       pointers is not necessary since they have on-chip L1

30 changes: 15 additions & 15 deletions include/nba/engines/cuda/compat.hh
@@ -9,23 +9,23 @@

 #include <nba/framework/config.hh>

+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-};
+    uint16_t item_size_in;   // for fixed-size cases
+    uint16_t item_size_out;  // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned

 #endif

2 changes: 1 addition & 1 deletion include/nba/framework/config.hh
@@ -47,7 +47,7 @@

 #define NBA_MAX_TASKPOOL_SIZE   (2048u)
 #define NBA_MAX_BATCHPOOL_SIZE  (2048u)
-#define NBA_MAX_IO_BASES    (17)
+#define NBA_MAX_IO_BASES    (1)

 #define NBA_MAX_ANNOTATION_SET_SIZE     (7)
 #define NBA_MAX_NODELOCALSTORAGE_ENTRIES (16)

56 changes: 33 additions & 23 deletions include/nba/framework/datablock.hh
@@ -79,22 +79,24 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
 };
 #else
 struct item_size_info {
     union {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
     };
-    uint16_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
+    uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
 };
 #endif

-/** Datablock tracking struct.
+/**
+ * Datablock tracking struct.
  *
- * It resides in PacketBatch as a static array, and keeps track of the
- * status of data blocks attached to the batch.
+ * It contains information required to reuse device buffers that are
+ * already copied to the device. It resides in PacketBatch as a static
+ * array.
  */
 struct datablock_tracker {
     void *host_in_ptr;
@@ -105,30 +107,38 @@ struct datablock_tracker {
     size_t in_count;
     size_t out_size;
     size_t out_count;
-    //struct item_size_info exact_item_sizes;
     struct item_size_info *aligned_item_sizes_h;
     memory_t aligned_item_sizes_d;
 };

-/* NOTE: The alignment of this struct should match with CUDA. */
+/**
+ * Datablock batch info struct.
+ *
+ * It contains item offset/size information for variable-length datablocks.
+ *
+ * NOTE: The alignment of this struct should match with CUDA.
+ */
+struct datablock_batch_info {
+    void *buffer_bases_in;
+    void *buffer_bases_out;
+    uint32_t item_count_in;
+    uint32_t item_count_out;
+    uint16_t *item_sizes_in;
+    uint16_t *item_sizes_out;
+    uint32_t *item_offsets_in;
+    uint32_t *item_offsets_out;
+}; // __cuda_aligned
+
+/**
+ * Kernel argument spec for datablocks.
+ */
 struct datablock_kernel_arg {
     uint32_t total_item_count_in;
     uint32_t total_item_count_out;
-    void *buffer_bases_in[NBA_MAX_COPROC_PPDEPTH];
-    void *buffer_bases_out[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_in[NBA_MAX_COPROC_PPDEPTH];
-    uint32_t item_count_out[NBA_MAX_COPROC_PPDEPTH];
-    union {
-        uint16_t item_size_in;
-        uint16_t *item_sizes_in[NBA_MAX_COPROC_PPDEPTH];
-    };
-    union {
-        uint16_t item_size_out;
-        uint16_t *item_sizes_out[NBA_MAX_COPROC_PPDEPTH];
-    };
-    uint16_t *item_offsets_in[NBA_MAX_COPROC_PPDEPTH];
-    uint16_t *item_offsets_out[NBA_MAX_COPROC_PPDEPTH];
-}; // __attribute__((aligned(8)));
+    uint16_t item_size_in;   // for fixed-size cases
+    uint16_t item_size_out;  // for fixed-size cases
+    struct datablock_batch_info batches[0];
+}; // __cuda_aligned


 /** Datablock information class.
@@ -165,7 +175,7 @@ public:
     void preprocess(PacketBatch *batch, void *host_ptr);
     void postprocess(OffloadableElement *elem, int input_port, PacketBatch *batch, void *host_ptr);

-    /* Below methods arre used only when ROI type is USER_PREPROC/USER_POSTPROC. */
+    /* Below methods are used only when ROI type is USER_PREPROC/USER_POSTPROC. */
    virtual void calculate_read_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)
    { out_bytes = 0; out_count = 0; }
    virtual void calculate_write_buffer_size(PacketBatch *batch, size_t &out_bytes, size_t &out_count)

1 change: 1 addition & 0 deletions include/nba/framework/elementgraph.hh
@@ -46,6 +46,7 @@ public:
     /* Inserts the given batch/offloadtask to the internal task queue.
      * This does not execute the pipeline; call flush_tasks() for that. */
     void enqueue_batch(PacketBatch *batch, Element *start_elem, int input_port = 0);
     void enqueue_offload_task(OffloadTask *otask, OffloadableElement *start_elem, int input_port = 0);
+    void enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port = 0);

     /* Tries to run all pending computation tasks. */

5 changes: 2 additions & 3 deletions include/nba/framework/offloadtask.hh
@@ -58,8 +58,7 @@ public:
     void notify_completion();

 public:
-    /* Initialized during execute(). */
-    struct resource_param res;
+    struct resource_param res; /* Initialized during execute(). */
     uint64_t offload_start;
     double offload_cost;
     size_t num_pkts;
@@ -80,7 +79,7 @@ public:
     OffloadableElement* elem;
     int dbid_h2d[NBA_MAX_DATABLOCKS];

-    struct datablock_kernel_arg *dbarray_h;
+    struct datablock_kernel_arg **dbarray_h;
     memory_t dbarray_d;

     struct ev_async *completion_watcher __cache_aligned;

2 changes: 1 addition & 1 deletion src/engines/cuda/computecontext.cc
@@ -12,7 +12,7 @@ struct cuda_event_context {
     void *user_arg;
 };

-#define IO_BASE_SIZE (4 * 1024 * 1024)
+#define IO_BASE_SIZE (16 * 1024 * 1024)
 #undef USE_PHYS_CONT_MEMORY // performance degraded :(

 CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother)

2 changes: 1 addition & 1 deletion src/lib/coprocessor.cc
@@ -165,7 +165,7 @@ void *coproc_loop(void *arg)
     ctx->task_done_queue = new FixedRing<OffloadTask *, nullptr>(256, ctx->loc.node_id);

     /* Initialize the event loop. */
-    ctx->loop = ev_loop_new(EVFLAG_AUTO);
+    ctx->loop = ev_loop_new(EVFLAG_AUTO | EVFLAG_NOSIGMASK);
     ctx->loop_broken = false;
     ev_set_userdata(ctx->loop, ctx);

4 changes: 2 additions & 2 deletions src/lib/datablock.cc
@@ -181,8 +181,8 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) {
     case READ_PARTIAL_PACKET: {
         void *invalid_value = this->get_invalid_value();
         FOR_EACH_PACKET_ALL_PREFETCH(batch, 4u) {
-            size_t aligned_elemsz = t->aligned_item_sizes_h->size;
-            size_t offset = t->aligned_item_sizes_h->size * pkt_idx;
+            uint16_t aligned_elemsz = t->aligned_item_sizes_h->size;
+            uint32_t offset = t->aligned_item_sizes_h->size * pkt_idx;
             if (IS_PACKET_INVALID(batch, pkt_idx)) {
                 if (invalid_value != nullptr) {
                     rte_memcpy((char *) host_in_buffer + offset, invalid_value, aligned_elemsz);

(The remaining 3 changed files are not shown.)
