Introduce the nba::dev_offset_t alias (declared in shiftedint.hh) and use it
in place of the spelled-out ShiftedInt type for the per-item offset arrays and
pointers in compat.hh, datablock.hh and OffloadTask::copy_h2d().

diff --git a/include/nba/core/shiftedint.hh b/include/nba/core/shiftedint.hh
index 73f9d9f..5c098a3 100644
--- a/include/nba/core/shiftedint.hh
+++ b/include/nba/core/shiftedint.hh
@@ -194,6 +194,10 @@ private:
     }
 };
 
+// Changing this to ShiftedInt or uint32_t exactly
+// reproduces the performance drop when we first changed offset types.
+typedef ShiftedInt dev_offset_t;
+
 } /* endns(nba) */
 
 #endif /* __NBA_CORE_SHIFTEDINT_HH__ */
diff --git a/include/nba/engines/cuda/compat.hh b/include/nba/engines/cuda/compat.hh
index a170917..5419c32 100644
--- a/include/nba/engines/cuda/compat.hh
+++ b/include/nba/engines/cuda/compat.hh
@@ -18,8 +18,8 @@ struct datablock_batch_info {
     uint32_t item_count_out;
     uint16_t *item_sizes_in;
     uint16_t *item_sizes_out;
-    nba::ShiftedInt *item_offsets_in;
-    nba::ShiftedInt *item_offsets_out;
+    nba::dev_offset_t *item_offsets_in;
+    nba::dev_offset_t *item_offsets_out;
 }; // __cuda_aligned
 
 struct datablock_kernel_arg {
diff --git a/include/nba/framework/datablock.hh b/include/nba/framework/datablock.hh
index b9bfaba..c36a474 100644
--- a/include/nba/framework/datablock.hh
+++ b/include/nba/framework/datablock.hh
@@ -80,7 +80,7 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 12];
     };
-    ShiftedInt offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
+    dev_offset_t offsets[NBA_MAX_COMP_BATCH_SIZE * 12];
 };
 #else
 struct item_size_info {
@@ -88,7 +88,7 @@ struct item_size_info {
         uint16_t size;
         uint16_t sizes[NBA_MAX_COMP_BATCH_SIZE * 96];
     };
-    ShiftedInt offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
+    dev_offset_t offsets[NBA_MAX_COMP_BATCH_SIZE * 96];
 };
 #endif
 
@@ -126,8 +126,8 @@ struct datablock_batch_info {
     uint32_t item_count_out;
     uint16_t *item_sizes_in;
     uint16_t *item_sizes_out;
-    ShiftedInt *item_offsets_in;
-    ShiftedInt *item_offsets_out;
+    dev_offset_t *item_offsets_in;
+    dev_offset_t *item_offsets_out;
 }; // __cuda_aligned
 
 /**
diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
index 0a0634a..151e256 100644
--- a/src/lib/offloadtask.cc
+++ b/src/lib/offloadtask.cc
@@ -235,10 +235,10 @@ bool OffloadTask::copy_h2d()
                 dbarg_h->batches[b].item_sizes_out = (uint16_t *)
                     ((char *) t->aligned_item_sizes_d.ptr
                      + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarg_h->batches[b].item_offsets_in = (ShiftedInt *)
+                dbarg_h->batches[b].item_offsets_in = (dev_offset_t *)
                     ((char *) t->aligned_item_sizes_d.ptr
                      + (uintptr_t) offsetof(struct item_size_info, offsets));
-                dbarg_h->batches[b].item_offsets_out = (ShiftedInt *)
+                dbarg_h->batches[b].item_offsets_out = (dev_offset_t *)
                     ((char *) t->aligned_item_sizes_d.ptr
                      + (uintptr_t) offsetof(struct item_size_info, offsets));
             } else {