Finally fix IPsec GPU-only memory corruption bug.

* BIG THANKS to Keunhong for implementing void PMD and "slow" RX mode to emulate non-full RX computation batches and narrow down the scope. * Wrong item_idx vs. pkt_idx ........ orz * Wrong check of offset/length pairs. offset can be zero but length shouldn't be zero (if so, must be skipped).
ANLAB-KAIST · Mar 3, 2016 · 584ead2 · 584ead2
1 parent 56f1437
commit 584ead2
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 8 deletions.
diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu
@@ -710,10 +710,10 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
                                                   [item_idx];
     const int pkt_idx         = cur_block_info.pkt_idx;
     const int block_idx_local = cur_block_info.block_idx;
-    const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
-    const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
+    const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value<uintptr_t>();
+    const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
 
-    if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
+    if (cur_block_info.magic == 85739 && pkt_idx < 64 && length != 0) {
         flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
         if (flow_id != 65536)
             assert(flow_id < 1024);
@@ -739,9 +739,9 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
 
     __syncthreads();
 
-    if (flow_id != 65536) {
+    if (flow_id != 65536 && length != 0) {
+        assert(flow_id < 1024);
         assert(pkt_idx < 64);
-        assert(length != 0);
 
         const uint8_t *const aes_key = flows[flow_id].aes_key;
         uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases_in

diff --git a/src/lib/coprocessor.cc b/src/lib/coprocessor.cc
@@ -60,7 +60,9 @@ static void coproc_task_input_cb(struct ev_loop *loop, struct ev_async *watcher,
         task->coproc_ctx = ctx;
         task->copy_h2d();
         task->execute();
-        //task->cctx->sync(); // for DEBUG
+        #ifdef DEBUG_OFFLOAD
+        task->cctx->sync();
+        #endif
         /* We separate d2h copy step since CUDA implicitly synchronizes
          * kernel executions. See more details at:
          * http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#implicit-synchronization */
@@ -90,7 +92,9 @@ static void coproc_task_d2h_cb(struct ev_loop *loop, struct ev_async *watcher, i
         ctx->d2h_pending_queue->pop_front();
         if (task->poll_kernel_finished()) {
             task->copy_d2h();
-            //task->cctx->sync(); // for DEBUG
+            #ifdef DEBUG_OFFLOAD
+            task->cctx->sync();
+            #endif
             ctx->task_done_queue->push_back(task);
             if (ctx->task_done_queue->size() >= NBA_MAX_KERNEL_OVERLAP || !ev_is_pending(ctx->task_input_watcher))
                 ev_feed_event(loop, ctx->task_done_watcher, EV_ASYNC);

diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
@@ -48,7 +48,7 @@ OffloadTask::~OffloadTask()
 {
 }
 
-#ifdef DEBUG
+#ifdef DEBUG_OFFLOAD
 #define _debug_print_inb(tag, batch, dbid) { \
     size_t end = cctx->get_input_size(io_base); \
     size_t len = end - last_input_size; \