Skip to content

Commit

Permalink
Optimize FlatVector<T>::copyRanges (facebookincubator#6566)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#6566

FlatVector::copyRanges is slow when there are many small ranges. I noticed this while investigating slowness of array_constructor which used created 1-row ranges: facebookincubator#5958 (comment)

```
FlatVector.h

  void copyRanges(
      const BaseVector* source,
      const folly::Range<const BaseVector::CopyRange*>& ranges) override {
    for (auto& range : ranges) {
      copy(source, range.targetIndex, range.sourceIndex, range.count);
    }
  }
```

This change optimizes FlatVector<T>::copyRanges using code from copy(source, targetIndex, sourceIndex, count) which copies one range. This change also replaces the latter call with a call to copyRanges, hence, the overall amount of code is about the same.

An earlier change optimized array_constructor to not use copyRanges for primitive types, but it is still used by Array/MapVector::copyRanges, although, these do not create 1-row ranges. Still the array_constructor benchmark shows some wins for arrays and maps.

```
Before:

array_constructor_ARRAY_nullfree##2_const                  54.31ms     18.41
array_constructor_ARRAY_nulls##1                           33.50ms     29.85
array_constructor_ARRAY_nulls##2                           59.05ms     16.93
array_constructor_ARRAY_nulls#facebookincubator#3                           88.36ms     11.32
array_constructor_ARRAY_nulls##2_null                      74.53ms     13.42
array_constructor_ARRAY_nulls##2_const                    102.54ms      9.75

After:

array_constructor_ARRAY_nullfree##2_const                  41.36ms     24.18
array_constructor_ARRAY_nulls##1                           30.51ms     32.78
array_constructor_ARRAY_nulls##2                           55.13ms     18.14
array_constructor_ARRAY_nulls#facebookincubator#3                           77.93ms     12.83
array_constructor_ARRAY_nulls##2_null                      68.84ms     14.53
array_constructor_ARRAY_nulls##2_const                     83.91ms     11.92

Before:

array_constructor_MAP_nullfree##2_const                    67.44ms     14.83
array_constructor_MAP_nulls##1                             37.00ms     27.03
array_constructor_MAP_nulls##2                             67.76ms     14.76
array_constructor_MAP_nulls#facebookincubator#3                            100.88ms      9.91
array_constructor_MAP_nulls##2_null                        84.22ms     11.87
array_constructor_MAP_nulls##2_const                      122.55ms      8.16

After:

array_constructor_MAP_nullfree##2_const                    49.94ms     20.03
array_constructor_MAP_nulls##1                             34.34ms     29.12
array_constructor_MAP_nulls##2                             55.23ms     18.11
array_constructor_MAP_nulls#facebookincubator#3                             82.64ms     12.10
array_constructor_MAP_nulls##2_null                        70.74ms     14.14
array_constructor_MAP_nulls##2_const                       88.13ms     11.35
```

Reviewed By: laithsakka

Differential Revision: D49269500

fbshipit-source-id: 7b702921202f4bb8d10a252eb0ab20f0e5792ae6
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Sep 19, 2023
1 parent 5c454f7 commit bd8d20b
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 209 deletions.
23 changes: 23 additions & 0 deletions velox/vector/BaseVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,29 @@ VectorPtr BaseVector::createInternal(
}
}

// static
void BaseVector::setNulls(
uint64_t* rawNulls,
const folly::Range<const CopyRange*>& ranges,
bool isNull) {
const auto nullBits = isNull ? bits::kNull : bits::kNotNull;
applyToEachRange(
ranges, [&](auto targetIndex, auto /*sourceIndex*/, auto count) {
bits::fillBits(rawNulls, targetIndex, targetIndex + count, nullBits);
});
}

// static
void BaseVector::copyNulls(
uint64_t* targetRawNulls,
const uint64_t* sourceRawNulls,
const folly::Range<const CopyRange*>& ranges) {
applyToEachRange(ranges, [&](auto targetIndex, auto sourceIndex, auto count) {
bits::copyBits(
sourceRawNulls, sourceIndex, targetRawNulls, targetIndex, count);
});
}

void BaseVector::addNulls(const uint64_t* bits, const SelectivityVector& rows) {
VELOX_CHECK(isNullsWritable());
VELOX_CHECK(length_ >= rows.end());
Expand Down
51 changes: 45 additions & 6 deletions velox/vector/BaseVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,25 @@ class BaseVector {
nulls_->asMutable<uint64_t>(), idx, bits::kNull ? value : !value);
}

struct CopyRange {
vector_size_t sourceIndex;
vector_size_t targetIndex;
vector_size_t count;
};

/// Sets null flags for each row in 'ranges' to 'isNull'.
static void setNulls(
uint64_t* rawNulls,
const folly::Range<const CopyRange*>& ranges,
bool isNull);

/// Copies null flags for each row in 'ranges' from 'sourceRawNulls' to
/// 'targetRawNulls'.
static void copyNulls(
uint64_t* targetRawNulls,
const uint64_t* sourceRawNulls,
const folly::Range<const CopyRange*>& ranges);

static int32_t
countNulls(const BufferPtr& nulls, vector_size_t begin, vector_size_t end) {
return nulls ? bits::countNulls(nulls->as<uint64_t>(), begin, end) : 0;
Expand Down Expand Up @@ -437,12 +456,6 @@ class BaseVector {
copyRanges(source, folly::Range(&range, 1));
}

struct CopyRange {
vector_size_t sourceIndex;
vector_size_t targetIndex;
vector_size_t count;
};

/// Converts SelectivityVetor into a list of CopyRanges having sourceIndex ==
/// targetIndex. Aims to produce as few ranges as possible. If all rows are
/// selected, returns a single range.
Expand Down Expand Up @@ -906,6 +919,32 @@ class BaseVector {
bool memoDisabled_{false};
};

/// Loops over rows in 'ranges' and invokes 'func' for each row.
/// @param TFunc A void function taking two arguments: targetIndex and
/// sourceIndex.
template <typename TFunc>
void applyToEachRow(
const folly::Range<const BaseVector::CopyRange*>& ranges,
const TFunc& func) {
for (const auto& range : ranges) {
for (auto i = 0; i < range.count; ++i) {
func(range.targetIndex + i, range.sourceIndex + i);
}
}
}

/// Loops over 'ranges' and invokes 'func' for each range.
/// @param TFunc A void function taking 3 arguments: targetIndex, sourceIndex
/// and count.
template <typename TFunc>
void applyToEachRange(
const folly::Range<const BaseVector::CopyRange*>& ranges,
const TFunc& func) {
for (const auto& range : ranges) {
func(range.targetIndex, range.sourceIndex, range.count);
}
}

template <>
uint64_t BaseVector::byteSize<bool>(vector_size_t count);

Expand Down
135 changes: 60 additions & 75 deletions velox/vector/ComplexVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,20 +299,17 @@ void RowVector::copyRanges(
}

if (isAllNullVector(*source)) {
for (const auto& range : ranges) {
for (auto i = 0; i < range.count; ++i) {
setNull(range.targetIndex + i, true);
}
}
BaseVector::setNulls(mutableRawNulls(), ranges, true);
return;
}

auto minTargetIndex = std::numeric_limits<vector_size_t>::max();
auto maxTargetIndex = std::numeric_limits<vector_size_t>::min();
for (auto& r : ranges) {
minTargetIndex = std::min(minTargetIndex, r.targetIndex);
maxTargetIndex = std::max(maxTargetIndex, r.targetIndex + r.count);
}
applyToEachRange(ranges, [&](auto targetIndex, auto sourceIndex, auto count) {
minTargetIndex = std::min(minTargetIndex, targetIndex);
maxTargetIndex = std::max(maxTargetIndex, targetIndex + count);
});

SelectivityVector rows(maxTargetIndex);
rows.setValidRange(0, minTargetIndex, false);
rows.updateBounds();
Expand All @@ -324,11 +321,7 @@ void RowVector::copyRanges(
DecodedVector decoded(*source);
if (decoded.isIdentityMapping() && !decoded.mayHaveNulls()) {
if (rawNulls_) {
auto* rawNulls = mutableRawNulls();
for (auto& r : ranges) {
bits::fillBits(
rawNulls, r.targetIndex, r.targetIndex + r.count, bits::kNotNull);
}
setNulls(mutableRawNulls(), ranges, false);
}
auto* rowSource = source->loadedVector()->as<RowVector>();
for (int i = 0; i < children_.size(); ++i) {
Expand All @@ -337,27 +330,26 @@ void RowVector::copyRanges(
} else {
std::vector<BaseVector::CopyRange> baseRanges;
baseRanges.reserve(ranges.size());
for (auto& r : ranges) {
for (vector_size_t i = 0; i < r.count; ++i) {
bool isNull = decoded.isNullAt(r.sourceIndex + i);
setNull(r.targetIndex + i, isNull);
if (isNull) {
continue;
}
auto baseIndex = decoded.index(r.sourceIndex + i);
if (!baseRanges.empty() &&
baseRanges.back().sourceIndex + 1 == baseIndex &&
baseRanges.back().targetIndex + 1 == r.targetIndex + i) {
++baseRanges.back().count;
} else {
baseRanges.push_back({
.sourceIndex = baseIndex,
.targetIndex = r.targetIndex + i,
.count = 1,
});
}
applyToEachRow(ranges, [&](auto targetIndex, auto sourceIndex) {
bool isNull = decoded.isNullAt(sourceIndex);
setNull(targetIndex, isNull);
if (isNull) {
return;
}
}
auto baseIndex = decoded.index(sourceIndex);
if (!baseRanges.empty() &&
baseRanges.back().sourceIndex + 1 == baseIndex &&
baseRanges.back().targetIndex + 1 == targetIndex) {
++baseRanges.back().count;
} else {
baseRanges.push_back({
.sourceIndex = baseIndex,
.targetIndex = targetIndex,
.count = 1,
});
}
});

auto* rowSource = decoded.base()->as<RowVector>();
for (int i = 0; i < children_.size(); ++i) {
children_[i]->copyRanges(
Expand Down Expand Up @@ -463,11 +455,7 @@ void ArrayVectorBase::copyRangesImpl(
VectorPtr* targetValues,
VectorPtr* targetKeys) {
if (isAllNullVector(*source)) {
for (const auto& range : ranges) {
for (auto i = 0; i < range.count; ++i) {
setNull(range.targetIndex + i, true);
}
}
BaseVector::setNulls(mutableRawNulls(), ranges, true);
return;
}

Expand Down Expand Up @@ -537,46 +525,43 @@ void ArrayVectorBase::copyRangesImpl(
} else {
std::vector<CopyRange> outRanges;
vector_size_t totalCount = 0;
for (auto& range : ranges) {
if (range.count == 0) {
continue;
}
VELOX_DCHECK(BaseVector::length_ >= range.targetIndex + range.count);
totalCount += range.count;
}
outRanges.reserve(totalCount);
for (auto& range : ranges) {
for (vector_size_t i = 0; i < range.count; ++i) {
if (source->isNullAt(range.sourceIndex + i)) {
setNull(range.targetIndex + i, true);
} else {
if (setNotNulls) {
setNull(range.targetIndex + i, false);
applyToEachRange(
ranges, [&](auto targetIndex, auto sourceIndex, auto count) {
if (count > 0) {
VELOX_DCHECK(BaseVector::length_ >= targetIndex + count);
totalCount += count;
}
vector_size_t wrappedIndex =
source->wrappedIndex(range.sourceIndex + i);
vector_size_t copySize = sourceArray->sizeAt(wrappedIndex);

if (copySize > 0) {
auto copyOffset = sourceArray->offsetAt(wrappedIndex);

// If we're copying two adjacent ranges, merge them. This only
// works if they're consecutive.
if (!outRanges.empty() &&
(outRanges.back().sourceIndex + outRanges.back().count ==
copyOffset)) {
outRanges.back().count += copySize;
} else {
outRanges.push_back({copyOffset, childSize, copySize});
}
});
outRanges.reserve(totalCount);
applyToEachRow(ranges, [&](auto targetIndex, auto sourceIndex) {
if (source->isNullAt(sourceIndex)) {
setNull(targetIndex, true);
} else {
if (setNotNulls) {
setNull(targetIndex, false);
}
auto wrappedIndex = source->wrappedIndex(sourceIndex);
auto copySize = sourceArray->sizeAt(wrappedIndex);

if (copySize > 0) {
auto copyOffset = sourceArray->offsetAt(wrappedIndex);

// If we're copying two adjacent ranges, merge them. This only
// works if they're consecutive.
if (!outRanges.empty() &&
(outRanges.back().sourceIndex + outRanges.back().count ==
copyOffset)) {
outRanges.back().count += copySize;
} else {
outRanges.push_back({copyOffset, childSize, copySize});
}

mutableOffsets[range.targetIndex + i] = childSize;
mutableSizes[range.targetIndex + i] = copySize;
childSize += copySize;
}

mutableOffsets[targetIndex] = childSize;
mutableSizes[targetIndex] = copySize;
childSize += copySize;
}
}
});

targetValues->get()->resize(childSize);
targetValues->get()->copyRanges(sourceValues, outRanges);
Expand Down
Loading

0 comments on commit bd8d20b

Please sign in to comment.