Skip to content

Commit

Permalink
refs #6: Fix memory corruption bug.
Browse files Browse the repository at this point in the history
 * IPsec works well with task/datablock reuse optimization.
   - However, this refactored architecture has high libev/syscall
     overheads (> 10% CPU cycles of worker threads).
   - TODO: optimize it...

 * When reusing tasks,
   - we should keep task itself (do not free it!) and
   - we should update task->elem as well as task->tracker.element

 * There was a serious bug that reused GPU input buffer for outputs
   (for cases when roi/wri is WHOLE_PACKET) are not actually included
   in device-to-host copies, resulting in NO take-back of computation
   results.
   - Currently we allocate an output buffer explicitly without such
     buffer reuse optimization.
   - TODO: reuse input buffer and include its offset/lengths to
     coalescing of d2h copies
  • Loading branch information
achimnol committed Jan 10, 2016
1 parent f63dd07 commit 309ab57
Show file tree
Hide file tree
Showing 12 changed files with 103 additions and 45 deletions.
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAES.hh
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ public:
size_t get_used_datablocks(int *datablock_ids)
{
datablock_ids[0] = dbid_enc_payloads;
datablock_ids[1] = dbid_iv;
datablock_ids[2] = dbid_flow_ids;
datablock_ids[1] = dbid_flow_ids;
datablock_ids[2] = dbid_iv;
datablock_ids[3] = dbid_aes_block_info;
return 4;
}
Expand Down
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAES_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

/* The index is given by the order in get_used_datablocks(). */
#define dbid_enc_payloads_d (0)
#define dbid_iv_d (1)
#define dbid_flow_ids_d (2)
#define dbid_flow_ids_d (1)
#define dbid_iv_d (2)
#define dbid_aes_block_info_d (3)

#ifndef __AES_CORE__ /*same constants are defined in ssl/aes/aes_core.h */
Expand Down
16 changes: 8 additions & 8 deletions elements/ipsec/IPsecDatablocks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
namespace nba {

int dbid_enc_payloads;
int dbid_iv;
int dbid_flow_ids;
int dbid_iv;
int dbid_aes_block_info;

static DataBlock* db_enc_payloads_ctor (void) {
Expand All @@ -14,18 +14,18 @@ static DataBlock* db_enc_payloads_ctor (void) {
new (ptr) IPsecEncryptedPayloadDataBlock();
return ptr;
};
static DataBlock* db_iv_ctor (void) {
DataBlock *ptr = (DataBlock *) rte_malloc("datablock", sizeof(IPsecIVDataBlock), CACHE_LINE_SIZE);
assert(ptr != nullptr);
new (ptr) IPsecIVDataBlock();
return ptr;
};
static DataBlock* db_flow_ids_ctor (void) {
DataBlock *ptr = (DataBlock *) rte_malloc("datablock", sizeof(IPsecFlowIDsDataBlock), CACHE_LINE_SIZE);
assert(ptr != nullptr);
new (ptr) IPsecFlowIDsDataBlock();
return ptr;
};
static DataBlock* db_iv_ctor (void) {
DataBlock *ptr = (DataBlock *) rte_malloc("datablock", sizeof(IPsecIVDataBlock), CACHE_LINE_SIZE);
assert(ptr != nullptr);
new (ptr) IPsecIVDataBlock();
return ptr;
};
static DataBlock* db_aes_block_info_ctor (void) {
DataBlock *ptr = (DataBlock *) rte_malloc("datablock", sizeof(IPsecAESBlockInfoDataBlock), CACHE_LINE_SIZE);
assert(ptr != nullptr);
Expand All @@ -34,8 +34,8 @@ static DataBlock* db_aes_block_info_ctor (void) {
};

declare_datablock("ipsec.enc_payloads", db_enc_payloads_ctor, dbid_enc_payloads);
declare_datablock("ipsec.iv", db_iv_ctor, dbid_iv);
declare_datablock("ipsec.flow_ids", db_flow_ids_ctor, dbid_flow_ids);
declare_datablock("ipsec.iv", db_iv_ctor, dbid_iv);
declare_datablock("ipsec.aes_block_info", db_aes_block_info_ctor, dbid_aes_block_info);

}
Expand Down
3 changes: 3 additions & 0 deletions include/nba/engines/cuda/computecontext.hh
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ private:
CPUMemoryPool _cpu_mempool_in[NBA_MAX_IO_BASES];
CPUMemoryPool _cpu_mempool_out[NBA_MAX_IO_BASES];

void *dummy_host_buf;
memory_t dummy_dev_buf;

size_t num_kernel_args;
struct kernel_arg kernel_args[CUDA_MAX_KERNEL_ARGS];

Expand Down
4 changes: 2 additions & 2 deletions include/nba/engines/cuda/mempool.hh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public:
size_t offset;
int ret = _alloc(size, &offset);
if (ret == 0)
return (void *) ((uint8_t *) base + (uintptr_t) offset);
return (void *) ((uintptr_t) base + offset);
return NULL;
}

Expand Down Expand Up @@ -85,7 +85,7 @@ public:
size_t offset;
int ret = _alloc(size, &offset);
if (ret == 0)
return (void *) ((uint8_t *) base + (uintptr_t) offset);
return (void *) ((uintptr_t) base + offset);
return NULL;
}

Expand Down
4 changes: 3 additions & 1 deletion include/nba/framework/offloadtask.hh
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ public:
struct ev_async *completion_watcher __cache_aligned;
struct rte_ring *completion_queue __cache_aligned;

private:
uint64_t task_id; // for deubgging
private:
friend class OffloadableElement;

void *host_write_begin;
Expand All @@ -97,6 +97,8 @@ private:
size_t input_alloc_size_begin;
size_t output_alloc_size_begin;

size_t last_input_size;
size_t last_output_size;
};

}
Expand Down
9 changes: 9 additions & 0 deletions src/engines/cuda/computecontext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de
_cpu_mempool_in[i].init_with_flags(io_base_size, cudaHostAllocPortable);
_cpu_mempool_out[i].init_with_flags(io_base_size, cudaHostAllocPortable);
}
{
void *t;
cutilSafeCall(cudaMalloc((void **) &t, 64));
dummy_dev_buf.ptr = t;
cutilSafeCall(cudaHostAlloc((void **) &t, 64, cudaHostAllocPortable));
dummy_host_buf = t;
}
cutilSafeCall(cudaHostAlloc((void **) &checkbits_h, MAX_BLOCKS, cudaHostAllocMapped));
cutilSafeCall(cudaHostGetDevicePointer((void **) &checkbits_d, checkbits_h, 0));
assert(checkbits_h != NULL);
Expand Down Expand Up @@ -111,12 +118,14 @@ void CUDAComputeContext::clear_io_buffers(io_base_t io_base)

int CUDAComputeContext::enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size)
{
//cutilSafeCall(cudaMemcpyAsync(dummy_dev_buf.ptr, dummy_host_buf, 64, cudaMemcpyHostToDevice, _stream));
cutilSafeCall(cudaMemcpyAsync(dev_buf.ptr, host_buf, size, cudaMemcpyHostToDevice, _stream));
return 0;
}

int CUDAComputeContext::enqueue_memread_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size)
{
//cutilSafeCall(cudaMemcpyAsync(dummy_host_buf, dummy_dev_buf.ptr, 64, cudaMemcpyDeviceToHost, _stream));
cutilSafeCall(cudaMemcpyAsync(host_buf, dev_buf.ptr, size, cudaMemcpyDeviceToHost, _stream));
return 0;
}
Expand Down
3 changes: 1 addition & 2 deletions src/lib/coprocessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <nba/core/intrinsic.hh>
#include <nba/core/threading.hh>
#include <nba/core/queue.hh>
#include <nba/element/packetbatch.hh>
#include <nba/framework/threadcontext.hh>
#include <nba/framework/logging.hh>
#include <nba/framework/computation.hh>
Expand Down Expand Up @@ -56,7 +57,6 @@ static void coproc_task_input_cb(struct ev_loop *loop, struct ev_async *watcher,
* and libev will call them first and then call earlier steps again. */
ret = rte_ring_dequeue(ctx->task_input_queue, (void **) &task);
if (task != nullptr) {
assert(task->cctx != nullptr);
task->coproc_ctx = ctx;
task->copy_h2d();
task->execute();
Expand Down Expand Up @@ -87,7 +87,6 @@ static void coproc_task_d2h_cb(struct ev_loop *loop, struct ev_async *watcher, i
if (ctx->d2h_pending_queue->size() > 0) {
OffloadTask *task = ctx->d2h_pending_queue->front();
ctx->d2h_pending_queue->pop_front();
assert(task != nullptr);
if (task->poll_kernel_finished()) {
//task->cctx->sync();
task->copy_d2h();
Expand Down
4 changes: 2 additions & 2 deletions src/lib/element.cc
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,9 @@ int OffloadableElement::offload(ElementGraph *mother, OffloadTask *otask, int in
int dev_idx = 0;
uint64_t now = rte_rdtsc();
otask->state = TASK_INITIALIZING;
otask->task_id = task_id ++;
otask->task_id += 100000; // for debugging
otask->offload_start = now;
otask->state = TASK_INITIALIZED;
otask->state = TASK_PREPARED;
mother->ready_tasks[dev_idx].push_back(otask);
/* This should always succeed. */
return 0;
Expand Down
3 changes: 2 additions & 1 deletion src/lib/elementgraph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void ElementGraph::flush_offloaded_tasks()
task->cctx = cctx;

if (task->state < TASK_PREPARED) {
bool had_io_base = (task->io_base != INVALID_IO_BASE);
bool has_io_base = false;
if (task->io_base == INVALID_IO_BASE) {
task->io_base = cctx->alloc_io_base();
Expand Down Expand Up @@ -174,6 +175,7 @@ void ElementGraph::enqueue_batch(PacketBatch *batch, Element *start_elem, int in
void ElementGraph::enqueue_offload_task(OffloadTask *otask, Element *start_elem, int input_port)
{
assert(start_elem != nullptr);
otask->elem = dynamic_cast<OffloadableElement*>(start_elem);
otask->tracker.element = start_elem;
otask->tracker.input_port = input_port;
queue.push_back(Task::to_task(otask));
Expand Down Expand Up @@ -533,7 +535,6 @@ void ElementGraph::process_offload_task(OffloadTask *otask)
{
Element *current_elem = otask->tracker.element;
OffloadableElement *offloadable = dynamic_cast<OffloadableElement*>(current_elem);
assert(offloadable != nullptr);
assert(offloadable->offload(this, otask, otask->tracker.input_port) == 0);
}

Expand Down
11 changes: 6 additions & 5 deletions src/lib/io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,19 +151,20 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn
ctx->elem_graph->enqueue_offload_task(task,
ctx->elem_graph->get_first_next(task->elem),
0);
/* This task is reused. We keep them intact. */
} else {
for (size_t b = 0, b_max = task->batches.size(); b < b_max; b ++) {
task->batches[b]->compute_time += (uint64_t)
((float) task_cycles / total_batch_size
- ((float) task->batches[b]->delay_time / task->batches[b]->count));
task->elem->enqueue_batch(task->batches[b]);
}
}

/* Free the task object. */
task->cctx = nullptr;
task->~OffloadTask();
rte_mempool_put(ctx->task_pool, (void *) task);
/* Free the task object. */
task->cctx = nullptr;
task->~OffloadTask();
rte_mempool_put(ctx->task_pool, (void *) task);
}

/* Free the resources used for this offload task. */
cctx->currently_running_task = nullptr;
Expand Down
Loading

0 comments on commit 309ab57

Please sign in to comment.