Skip to content

Commit

Permalink
refs #6, #8: Use ShiftedInt<uint16_t, 2> instead of uint32_t for offsets
Browse files Browse the repository at this point in the history
 * This restores former performance improvements!
   (Now ~3.4 Gbps per node with IPsec@64B, previously ~2.6 Gbps)
  • Loading branch information
achimnol committed Jan 22, 2016
1 parent 7f99ef1 commit f041e78
Show file tree
Hide file tree
Showing 8 changed files with 30 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ if v: CFLAGS += ' -DNBA_RANDOM_PORT_ACCESS'
# NVIDIA CUDA configurations
if USE_CUDA:
CUDA_ARCHS = compilelib.get_cuda_arch()
NVCFLAGS = '-O2 -g -std=c++11 --use_fast_math -Iinclude -I/usr/local/cuda/include'
NVCFLAGS = '-O2 -g -std=c++11 --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include'
CFLAGS += ' -I/usr/local/cuda/include'
LIBS += ' -L/usr/local/cuda/lib64 -lcudart' #' -lnvidia-ml'
print(CUDA_ARCHS)
Expand Down
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAES_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,7 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
[item_idx];
const int pkt_idx = cur_block_info.pkt_idx;
const int block_idx_local = cur_block_info.block_idx;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx];
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];

if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
Expand Down Expand Up @@ -792,4 +792,4 @@ void *nba::ipsec_aes_encryption_get_cuda_kernel() {
return reinterpret_cast<void *> (AES_ctr_encrypt_chunk_SharedMem_5);
}

// vim: ts=8 sts=4 sw=4 et
// vim: ts=8 sts=4 sw=4 et tw=150
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1252,7 +1252,7 @@ __global__ void computeHMAC_SHA1_3(
const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];

const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx];
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
if (enc_payload_base != NULL && offset != 0 && length != 0) {
const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
Expand All @@ -1277,4 +1277,4 @@ void *nba::ipsec_hsha1_encryption_get_cuda_kernel() {
return reinterpret_cast<void *> (computeHMAC_SHA1_3);
}

// vim: ts=8 sts=4 sw=4 et
// vim: ts=8 sts=4 sw=4 et tw=150
13 changes: 11 additions & 2 deletions include/nba/core/shiftedint.hh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@
#include <cassert>
#include <exception>

#ifndef __CUDACC__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif

namespace nba {

#ifdef DEBUG
Expand All @@ -28,7 +37,7 @@ template<typename Int, unsigned Shift>
class ShiftedInt
{
private:
typedef __uint128_t LARGE_INT;
typedef uint64_t LARGE_INT;
static_assert(std::is_integral<Int>::value, "Integer type required.");

Int shifted_value;
Expand Down Expand Up @@ -177,7 +186,7 @@ private:
}

template<typename ReturnInt>
ReturnInt as_value() {
__host__ __device__ inline ReturnInt as_value() const {
static_assert(std::numeric_limits<ReturnInt>::max()
>= ((LARGE_INT)std::numeric_limits<Int>::max() << Shift),
"return type is not large enough.");
Expand Down
6 changes: 4 additions & 2 deletions include/nba/engines/cuda/compat.hh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
* Note that the nvcc should support C++11 (CUDA v6.5 or higher).
*/

#include <cstdint>
#include <nba/core/shiftedint.hh>
#include <nba/framework/config.hh>

struct datablock_batch_info {
Expand All @@ -16,8 +18,8 @@ struct datablock_batch_info {
uint32_t item_count_out;
uint16_t *item_sizes_in;
uint16_t *item_sizes_out;
uint32_t *item_offsets_in;
uint32_t *item_offsets_out;
nba::ShiftedInt<uint16_t, 2> *item_offsets_in;
nba::ShiftedInt<uint16_t, 2> *item_offsets_out;
}; // __cuda_aligned

struct datablock_kernel_arg {
Expand Down
9 changes: 5 additions & 4 deletions include/nba/framework/datablock.hh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <nba/framework/config.hh>
#include <nba/framework/computecontext.hh>
#include <nba/core/shiftedint.hh> // should come after cuda headers
#include <vector>
#include <string>
#include <tuple>
Expand Down Expand Up @@ -79,15 +80,15 @@ struct item_size_info {
uint16_t size;
uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
};
uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
ShiftedInt<uint16_t, 2>[NBA_MAX_COMP_BATCH_SIZE * 12];
};
#else
struct item_size_info {
union {
uint16_t size;
uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
};
uint32_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
ShiftedInt<uint16_t, 2> offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
};
#endif

Expand Down Expand Up @@ -125,8 +126,8 @@ struct datablock_batch_info {
uint32_t item_count_out;
uint16_t *item_sizes_in;
uint16_t *item_sizes_out;
uint32_t *item_offsets_in;
uint32_t *item_offsets_out;
ShiftedInt<uint16_t, 2> *item_offsets_in;
ShiftedInt<uint16_t, 2> *item_offsets_out;
}; // __cuda_aligned

/**
Expand Down
4 changes: 2 additions & 2 deletions src/lib/datablock.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) {
if (IS_PACKET_INVALID(batch, pkt_idx))
continue;
size_t aligned_elemsz = t->aligned_item_sizes_h->sizes[pkt_idx];
size_t offset = t->aligned_item_sizes_h->offsets[pkt_idx];
size_t offset = t->aligned_item_sizes_h->offsets[pkt_idx].as_value<size_t>();
rte_memcpy((char*) host_in_buffer + offset,
rte_pktmbuf_mtod(batch->packets[pkt_idx], char*) + read_roi.offset,
aligned_elemsz);
Expand Down Expand Up @@ -250,7 +250,7 @@ void DataBlock::postprocess(OffloadableElement *elem, int input_port, PacketBatc
t->aligned_item_sizes_h->sizes[pkt_idx]);
size_t offset = bitselect<size_t>(write_roi.type == WRITE_PARTIAL_PACKET,
t->aligned_item_sizes_h->size * pkt_idx,
t->aligned_item_sizes_h->offsets[pkt_idx]);
t->aligned_item_sizes_h->offsets[pkt_idx].as_value<size_t>());
rte_memcpy(rte_pktmbuf_mtod(batch->packets[pkt_idx], char*) + write_roi.offset,
(char*) host_out_ptr + offset,
elemsz);
Expand Down
5 changes: 3 additions & 2 deletions src/lib/offloadtask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <nba/engines/cuda/computedevice.hh>
#include <nba/engines/cuda/computecontext.hh>
#endif
#include <nba/core/shiftedint.hh> // should come after cuda headers
#include <tuple>
#include <ev.h>
#include <rte_memcpy.h>
Expand Down Expand Up @@ -234,10 +235,10 @@ bool OffloadTask::copy_h2d()
dbarg_h->batches[b].item_sizes_out = (uint16_t *)
((char *) t->aligned_item_sizes_d.ptr
+ (uintptr_t) offsetof(struct item_size_info, sizes));
dbarg_h->batches[b].item_offsets_in = (uint32_t *)
dbarg_h->batches[b].item_offsets_in = (ShiftedInt<uint16_t, 2> *)
((char *) t->aligned_item_sizes_d.ptr
+ (uintptr_t) offsetof(struct item_size_info, offsets));
dbarg_h->batches[b].item_offsets_out = (uint32_t *)
dbarg_h->batches[b].item_offsets_out = (ShiftedInt<uint16_t, 2> *)
((char *) t->aligned_item_sizes_d.ptr
+ (uintptr_t) offsetof(struct item_size_info, offsets));
} else {
Expand Down

0 comments on commit f041e78

Please sign in to comment.