Skip to content

Commit

Permalink
Optimize ranges::copy_backward for vector<bool>::iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
winner245 committed Jan 21, 2025
1 parent 3418cd0 commit d3bd7df
Show file tree
Hide file tree
Showing 8 changed files with 322 additions and 198 deletions.
3 changes: 3 additions & 0 deletions libcxx/docs/ReleaseNotes/20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ Improvements and New Features
std::errc::not_a_directory``, or use ``err.default_error_condition()`` to map to an ``error_condition``, and then test
its ``value()`` and ``category()``.

- The ``std::ranges::copy_backward`` algorithm has been optimized for ``std::vector<bool>::iterator``\s, resulting in
a performance improvement of up to 2000x.

Deprecations and Removals
-------------------------

Expand Down
131 changes: 131 additions & 0 deletions libcxx/include/__algorithm/copy_backward.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
#define _LIBCPP___ALGORITHM_COPY_BACKWARD_H

#include <__algorithm/copy_move_common.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/iterator_operations.h>
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
#include <__type_traits/common_type.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_constructible.h>
Expand All @@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
__copy_backward(_InIter __first, _Sent __last, _OutIter __result);

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
__storage_type __b = *__last.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ == 0 || __n == 0
// do middle words
__storage_type __nw = __n / __bits_per_word;
__result.__seg_ -= __nw;
__last.__seg_ -= __nw;
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
// do last word
if (__n > 0) {
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
*--__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
}
}
return __result;
}

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz_l = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
__storage_type __b = *__last.__seg_ & __m;
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
if (__ddn > 0) {
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
if (__result.__ctz_ > __last.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
else
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
}
if (__dn > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
__last.__ctz_ -= __dn + __ddn;
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
}
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ != 0 || __n == 0
// do middle words
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) >> __clz_r;
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
__storage_type __b = *--__last.__seg_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __clz_r;
*--__result.__seg_ &= __m;
*__result.__seg_ |= __b << __result.__ctz_;
}
// do last word
if (__n > 0) {
__m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
__clz_r = __bits_per_word - __result.__ctz_;
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__n -= __dn;
if (__n > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
}
}
}
return __result;
}

template <class _AlgPolicy>
struct __copy_backward_impl {
template <class _InIter, class _Sent, class _OutIter>
Expand Down Expand Up @@ -107,6 +228,16 @@ struct __copy_backward_impl {
}
}

template <class _Cp, bool _IsConst>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
operator()(__bit_iterator<_Cp, _IsConst> __first,
__bit_iterator<_Cp, _IsConst> __last,
__bit_iterator<_Cp, false> __result) {
if (__last.__ctz_ == __result.__ctz_)
return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result));
return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result));
}

// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
Expand Down
136 changes: 5 additions & 131 deletions libcxx/include/__bit_reference
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#ifndef _LIBCPP___BIT_REFERENCE
#define _LIBCPP___BIT_REFERENCE

#include <__algorithm/copy_backward.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/min.h>
#include <__bit/countr.h>
Expand Down Expand Up @@ -307,134 +308,6 @@ copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last
return std::__copy_unaligned(__first, __last, __result);
}

// copy_backward

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
__storage_type __b = *__last.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ == 0 || __n == 0
// do middle words
__storage_type __nw = __n / __bits_per_word;
__result.__seg_ -= __nw;
__last.__seg_ -= __nw;
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
// do last word
if (__n > 0) {
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
*--__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
}
}
return __result;
}

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz_l = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
__storage_type __b = *__last.__seg_ & __m;
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
if (__ddn > 0) {
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
if (__result.__ctz_ > __last.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
else
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
}
if (__dn > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
__last.__ctz_ -= __dn + __ddn;
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
}
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ != 0 || __n == 0
// do middle words
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) >> __clz_r;
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
__storage_type __b = *--__last.__seg_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __clz_r;
*--__result.__seg_ &= __m;
*__result.__seg_ |= __b << __result.__ctz_;
}
// do last word
if (__n > 0) {
__m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
__clz_r = __bits_per_word - __result.__ctz_;
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__n -= __dn;
if (__n > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
}
}
}
return __result;
}

template <class _Cp, bool _IsConst>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
if (__last.__ctz_ == __result.__ctz_)
return std::__copy_backward_aligned(__first, __last, __result);
return std::__copy_backward_unaligned(__first, __last, __result);
}

// move

template <class _Cp, bool _IsConst>
Expand Down Expand Up @@ -997,9 +870,10 @@ private:
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
// Note: dependent nested name specifier __copy_backward_impl<_AlgPolicy>::operator() for friend declaration
// is not supported in clang. Thus, we use a friend declaration for the entire class.
template <class _AlgPolicy>
friend struct __copy_backward_impl;
template <class _Cl, class _Cr>
friend __bit_iterator<_Cr, false>
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/__vector/vector_bool.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define _LIBCPP___VECTOR_VECTOR_BOOL_H

#include <__algorithm/copy.h>
#include <__algorithm/copy_backward.h>
#include <__algorithm/fill_n.h>
#include <__algorithm/iterator_operations.h>
#include <__algorithm/max.h>
Expand Down
2 changes: 2 additions & 0 deletions libcxx/include/bitset
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ template <size_t N> struct hash<std::bitset<N>>;
#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
# include <__cxx03/bitset>
#else
# include <__algorithm/copy.h>
# include <__algorithm/copy_backward.h>
# include <__algorithm/count.h>
# include <__algorithm/fill.h>
# include <__algorithm/fill_n.h>
Expand Down
55 changes: 55 additions & 0 deletions libcxx/test/benchmarks/algorithms/copy_backward.bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20

#include <algorithm>
#include <benchmark/benchmark.h>
#include <vector>

static void bm_ranges_copy_backward_vb(benchmark::State& state, bool aligned) {
auto n = state.range();
std::vector<bool> in(n, true);
std::vector<bool> out(aligned ? n : n + 8);
benchmark::DoNotOptimize(&in);
auto dst = aligned ? out.end() : out.end() - 4;
for (auto _ : state) {
benchmark::DoNotOptimize(std::ranges::copy_backward(in, dst));
benchmark::DoNotOptimize(&out);
}
}

static void bm_copy_backward_vb(benchmark::State& state, bool aligned) {
auto n = state.range();
std::vector<bool> in(n, true);
std::vector<bool> out(aligned ? n : n + 8);
benchmark::DoNotOptimize(&in);
auto beg = in.begin();
auto end = in.end();
auto dst = aligned ? out.end() : out.end() - 4;
for (auto _ : state) {
benchmark::DoNotOptimize(std::copy_backward(beg, end, dst));
benchmark::DoNotOptimize(&out);
}
}

static void bm_ranges_copy_backward_vb_aligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, true); }
static void bm_ranges_copy_backward_vb_unaligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, false); }

static void bm_copy_backward_vb_aligned(benchmark::State& state) { bm_copy_backward_vb(state, true); }
static void bm_copy_backward_vb_unaligned(benchmark::State& state) { bm_copy_backward_vb(state, false); }

// Test std::ranges::copy_backward for vector<bool>::iterator
BENCHMARK(bm_ranges_copy_backward_vb_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
BENCHMARK(bm_ranges_copy_backward_vb_unaligned)->Range(8, 1 << 20);

// Test std::copy_backward for vector<bool>::iterator
BENCHMARK(bm_copy_backward_vb_aligned)->Range(8, 1 << 20);
BENCHMARK(bm_copy_backward_vb_unaligned)->Range(8, 1 << 20);

BENCHMARK_MAIN();
Loading

0 comments on commit d3bd7df

Please sign in to comment.