Skip to content

Commit

Permalink
More general thread-local storage
Browse files Browse the repository at this point in the history
Signed-off-by: Dan Hoeflinger <[email protected]>
  • Loading branch information
danhoeflinger committed Dec 20, 2024
1 parent e95689d commit 6b5019d
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 37 deletions.
22 changes: 7 additions & 15 deletions include/oneapi/dpl/pstl/algorithm_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,6 @@ namespace dpl
namespace __internal
{

// Factory helper removed by this commit: builds the active backend's
// per-thread storage, each thread's storage holding __num_elements
// elements initialized to __init_value. The element type of the
// backend container is deduced (CTAD) from the arguments.
// NOTE(review): superseded by constructing
// __par_backend::__thread_enumerable_storage<Container> directly,
// which lets callers choose the stored container type.
template <class _ValueType>
auto
__make_thread_enumerable_storage(std::size_t __num_elements, _ValueType __init_value)
{
return __par_backend::__thread_enumerable_storage{__num_elements, __init_value};
}

//------------------------------------------------------------------------
// any_of
//------------------------------------------------------------------------
Expand Down Expand Up @@ -4338,35 +4331,34 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando
_DiffType __n = __last - __first;
if (__n > 0)
{
auto __thread_enumerable_storage =
oneapi::dpl::__internal::__make_thread_enumerable_storage(__num_bins, _HistogramValueT{0});
__par_backend::__thread_enumerable_storage<std::vector<_HistogramValueT>> __tls{__num_bins, _HistogramValueT{0}};

//main histogram loop
//TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead
__par_backend::__parallel_for(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last,
[__func, &__thread_enumerable_storage](_RandomAccessIterator1 __first_local,
[__func, &__tls](_RandomAccessIterator1 __first_local,
_RandomAccessIterator1 __last_local) {
__internal::__brick_histogram(__first_local, __last_local, __func,
__thread_enumerable_storage.get(), _IsVector{});
__tls.get().begin(), _IsVector{});
});
// now accumulate temporary storage into output global histogram
__par_backend::__parallel_for(
__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __histogram_first,
__histogram_first + __num_bins,
[__histogram_first, &__thread_enumerable_storage](auto __global_histogram_first,
[__histogram_first, &__tls](auto __global_histogram_first,
auto __global_histogram_last) {
_DiffType __local_n = __global_histogram_last - __global_histogram_first;
std::size_t __num_temporary_copies = __thread_enumerable_storage.size();
std::size_t __num_temporary_copies = __tls.size();
_DiffType __range_begin_id = __global_histogram_first - __histogram_first;
//initialize output global histogram with first local histogram via assign
__internal::__brick_walk2_n(__thread_enumerable_storage.get_with_id(0) + __range_begin_id, __local_n,
__internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n,
__global_histogram_first, oneapi::dpl::__internal::__pstl_assign(),
_IsVector{});
for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i)
{
//accumulate into output global histogram with other local histogram via += operator
__internal::__brick_walk2_n(
__thread_enumerable_storage.get_with_id(__i) + __range_begin_id, __local_n,
__tls.get_with_id(__i).begin() + __range_begin_id, __local_n,
__global_histogram_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; },
_IsVector{});
}
Expand Down
17 changes: 9 additions & 8 deletions include/oneapi/dpl/pstl/omp/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,20 +153,21 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
__f(__first, __last);
}

template <typename _ValueType>
template <typename _StorageType>
struct __thread_enumerable_storage
{
__thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value)
template <typename... Args>
__thread_enumerable_storage(Args&&... args)
{
_PSTL_PRAGMA(omp parallel)
_PSTL_PRAGMA(omp single nowait)
{
__num_threads = omp_get_num_threads();
__thread_specific_storage.resize(__num_threads);
_PSTL_PRAGMA(omp taskloop shared(__thread_specific_storage, __num_bins, __init_value))
_PSTL_PRAGMA(omp taskloop shared(__thread_specific_storage))
for (std::size_t __tid = 0; __tid < __num_threads; ++__tid)
{
__thread_specific_storage[__tid].resize(__num_bins, __init_value);
__thread_specific_storage[__tid] = std::make_unique<_StorageType>(std::forward<Args>(args)...);
}
}
}
Expand All @@ -177,19 +178,19 @@ struct __thread_enumerable_storage
return __num_threads;
}

auto
_StorageType&
get_with_id(std::size_t __i)
{
return __thread_specific_storage[__i].begin();
return *__thread_specific_storage[__i];
}

auto
_StorageType&
get()
{
return get_with_id(omp_get_thread_num());
}

std::vector<std::vector<_ValueType>> __thread_specific_storage;
std::vector<std::unique_ptr<_StorageType>> __thread_specific_storage;
std::size_t __num_threads;
};

Expand Down
13 changes: 7 additions & 6 deletions include/oneapi/dpl/pstl/parallel_backend_serial.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,11 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)
{
}

// Serial backend: a single storage instance stands in for "per-thread" state.
template <typename _StorageType>
struct __thread_enumerable_storage
{
    // Forwards the construction arguments to the single underlying storage.
    template <typename... Args>
    __thread_enumerable_storage(Args&&... args) : __storage(std::forward<Args>(args)...)
    {
    }

    // Exactly one copy exists in the serial backend.
    std::size_t
    size() const
    {
        return std::size_t{1};
    }

    _StorageType&
    get()
    {
        return __storage;
    }

    // __i is ignored: the only valid id is 0 in the serial backend.
    _StorageType&
    get_with_id(std::size_t /*__i*/)
    {
        return get();
    }

    _StorageType __storage;
};

template <class _ExecutionPolicy, class _Index, class _Fp>
Expand Down
17 changes: 9 additions & 8 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
Original file line number Diff line number Diff line change
Expand Up @@ -1307,11 +1307,12 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

template <typename _ValueType>
template <typename _StorageType>
struct __thread_enumerable_storage
{
__thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value)
: __thread_specific_storage(__num_bins, __init_value)
template <typename... Args>
__thread_enumerable_storage(Args&&... args)
: __thread_specific_storage(std::forward<Args>(args)...)
{
}

Expand All @@ -1321,19 +1322,19 @@ struct __thread_enumerable_storage
return __thread_specific_storage.size();
}

auto
_StorageType&
get()
{
return __thread_specific_storage.local().begin();
return __thread_specific_storage.local();
}

auto
_StorageType&
get_with_id(std::size_t __i)
{
return __thread_specific_storage.begin()[__i].begin();
return __thread_specific_storage.begin()[__i];
}

tbb::enumerable_thread_specific<std::vector<_ValueType>> __thread_specific_storage;
tbb::enumerable_thread_specific<_StorageType> __thread_specific_storage;
};

} // namespace __tbb_backend
Expand Down

0 comments on commit 6b5019d

Please sign in to comment.