Skip to content

Commit

Permalink
unpack16 using avx2 and restrict
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Apr 7, 2024
1 parent 8a53594 commit 293d697
Show file tree
Hide file tree
Showing 6 changed files with 374 additions and 23 deletions.
18 changes: 16 additions & 2 deletions cpp/src/arrow/util/bpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -457,14 +457,28 @@ int unpack16_default(const uint8_t* in, uint16_t* out, int batch_size, int num_b
}
return batch_size;
}

struct Unpack16DynamicFunction {
using FunctionType = decltype(&unpack16_default);

static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
return {{DispatchLevel::NONE, unpack16_default}
#if defined(ARROW_HAVE_RUNTIME_AVX2)
,
{DispatchLevel::AVX2, unpack16_avx2}
#endif
};
}
};

}

// Public entry point for 16-bit unpacking. All four arguments are forwarded
// unchanged to the selected implementation and its return value is returned
// as-is.
//
// - When ARROW_HAVE_NEON is defined, the call goes straight to unpack16_neon,
//   with `in` reinterpreted as a pointer to uint16_t.
// - Otherwise the implementation is resolved through DynamicDispatch over the
//   candidates listed by Unpack16DynamicFunction.
int unpack16(const uint8_t* in, uint16_t* out, int batch_size, int num_bits) {
#if defined(ARROW_HAVE_NEON)
  return unpack16_neon(reinterpret_cast<const uint16_t*>(in), out, batch_size, num_bits);
#else
  // Function-local static: the dispatch object is constructed once, on the
  // first call. A previous unconditional `return unpack16_default(...)` here
  // made the dispatch below unreachable, so it has been removed.
  // NOTE(review): presumably DynamicDispatch picks the highest level the
  // running CPU supports - confirm against its definition.
  static DynamicDispatch<Unpack16DynamicFunction> dispatch;
  return dispatch.func(in, out, batch_size, num_bits);
#endif
}

Expand Down
34 changes: 17 additions & 17 deletions cpp/src/arrow/util/bpacking16_simd128_generated_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ struct Unpack16Bits128 {

using simd_batch = xsimd::make_sized_batch_t<uint16_t, 8>;

// Zero-bit-width case: writes 16 zeroed uint16_t values to `out` and reads
// nothing from `in`.
//
// `in` and `out` are declared __restrict__, matching the sibling unpackN_16
// routines; callers must not pass overlapping buffers.
//
// Returns `in` unchanged, since no input is consumed.
inline static const uint16_t* unpack0_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
  memset(out, 0x0, 16 * sizeof(*out));
  // Kept for symmetry with the sibling routines; `out` is a by-value
  // parameter, so this advance is not visible to the caller.
  out += 16;

  return in;
}

inline static const uint16_t* unpack1_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack1_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x1;

simd_batch masks(mask);
Expand All @@ -70,7 +70,7 @@ inline static const uint16_t* unpack1_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack2_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack2_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x3;

simd_batch masks(mask);
Expand All @@ -95,7 +95,7 @@ inline static const uint16_t* unpack2_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack3_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack3_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x7;

simd_batch masks(mask);
Expand All @@ -120,7 +120,7 @@ inline static const uint16_t* unpack3_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack4_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack4_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0xf;

simd_batch masks(mask);
Expand All @@ -145,7 +145,7 @@ inline static const uint16_t* unpack4_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack5_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack5_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x1f;

simd_batch masks(mask);
Expand All @@ -170,7 +170,7 @@ inline static const uint16_t* unpack5_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack6_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack6_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x3f;

simd_batch masks(mask);
Expand All @@ -195,7 +195,7 @@ inline static const uint16_t* unpack6_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack7_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack7_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x7f;

simd_batch masks(mask);
Expand All @@ -220,7 +220,7 @@ inline static const uint16_t* unpack7_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack8_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack8_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0xff;

simd_batch masks(mask);
Expand All @@ -245,7 +245,7 @@ inline static const uint16_t* unpack8_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack9_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack9_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x1ff;

simd_batch masks(mask);
Expand All @@ -270,7 +270,7 @@ inline static const uint16_t* unpack9_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack10_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack10_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x3ff;

simd_batch masks(mask);
Expand All @@ -295,7 +295,7 @@ inline static const uint16_t* unpack10_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack11_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack11_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x7ff;

simd_batch masks(mask);
Expand All @@ -320,7 +320,7 @@ inline static const uint16_t* unpack11_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack12_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack12_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0xfff;

simd_batch masks(mask);
Expand All @@ -345,7 +345,7 @@ inline static const uint16_t* unpack12_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack13_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack13_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x1fff;

simd_batch masks(mask);
Expand All @@ -370,7 +370,7 @@ inline static const uint16_t* unpack13_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack14_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack14_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x3fff;

simd_batch masks(mask);
Expand All @@ -395,7 +395,7 @@ inline static const uint16_t* unpack14_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack15_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack15_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
uint16_t mask = 0x7fff;

simd_batch masks(mask);
Expand All @@ -420,7 +420,7 @@ inline static const uint16_t* unpack15_16(const uint16_t* in, uint16_t* out) {
return in;
}

inline static const uint16_t* unpack16_16(const uint16_t* in, uint16_t* out) {
inline static const uint16_t* unpack16_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
memcpy(out, in, 16 * sizeof(*out));
in += 16;
out += 16;
Expand Down
Loading

0 comments on commit 293d697

Please sign in to comment.