Skip to content

Commit 78aa3f5

Browse files
authored
Merge pull request #1 from seiko2plus/impl-sincos
algo: implement sine/cosine based on Intel SVML for single/double precision
2 parents 62f49dc + e2e1b10 commit 78aa3f5

File tree

8 files changed

+1382
-0
lines changed

8 files changed

+1382
-0
lines changed

npsr/hwy.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#ifndef NPSR_HWY_H_
2+
#define NPSR_HWY_H_
3+
4+
#include <hwy/highway.h>
5+
// This macro is used to define intrinsics that are:
6+
// NOTE: equals to HWY_API.
7+
// - always inlined
8+
// - flattened (no separate stack frame)
9+
// - marked maybe unused to suppress warnings when they are not used
10+
// NOTE: we do not need to use HWY_ATTR because we wrap Highway intrinsics in
11+
// HWY_BEFORE_NAMESPACE()/HWY_AFTER_NAMESPACE()
12+
// which implies the nessessary target attributes via #pargma.
13+
#define NPSR_INTRIN static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
14+
#endif // NPSR_HWY_H_
15+
16+
#if defined(NPSR_HWY_FOREACH_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
17+
#ifdef NPSR_HWY_FOREACH_H_
18+
#undef NPSR_HWY_FOREACH_H_
19+
#else
20+
#define NPSR_HWY_FOREACH_H_
21+
#endif
22+
23+
HWY_BEFORE_NAMESPACE();
24+
namespace npsr::HWY_NAMESPACE {
25+
namespace hn = hwy::HWY_NAMESPACE;
26+
using hn::DFromV;
27+
using hn::MFromD;
28+
using hn::Rebind;
29+
using hn::RebindToUnsigned;
30+
using hn::TFromD;
31+
using hn::TFromV;
32+
using hn::VFromD;
33+
constexpr bool kNativeFMA = HWY_NATIVE_FMA != 0;
34+
35+
inline HWY_ATTR void DummyToSuppressUnusedWarning() {}
36+
} // namespace npsr::HWY_NAMESPACE
37+
HWY_AFTER_NAMESPACE();
38+
39+
#endif // NPSR_HWY_FOREACH_H_

npsr/lut-inl.h

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#if defined(NPSR_LUT_INL_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
2+
#ifdef NPSR_LUT_INL_H_
3+
#undef NPSR_LUT_INL_H_
4+
#else
5+
#define NPSR_LUT_INL_H_
6+
#endif
7+
8+
#include <tuple>
9+
10+
#include "npsr/hwy.h"
11+
12+
HWY_BEFORE_NAMESPACE();
13+
14+
namespace npsr::HWY_NAMESPACE {
15+
16+
/**
17+
* @brief SIMD-optimized lookup table implementation
18+
*
19+
* This class provides an efficient lookup table.
20+
* It stores data in both row-major and column-major
21+
* formats to optimize different access patterns.
22+
*
23+
* @tparam T Element type (must match SIMD vector element type)
24+
* @tparam kRows Number of rows in the lookup table
25+
* @tparam kCols Number of columns in the lookup table
26+
*
27+
* Example usage:
28+
* @code
29+
* // Create a 2x4 lookup table
30+
* constexpr Lut lut{{1.0f, 2.0f, 3.0f, 4.0f}, {5.0f, 6.0f, 7.0f, 8.0f}};
31+
* // Load values using SIMD indices
32+
* auto indices = Set(d, 2); // SIMD vector of indices
33+
* Vec<D> out0, out1;
34+
* lut.Load(indices, out0, out1);
35+
* @endcode
36+
*/
37+
template <typename T, size_t kRows, size_t kCols>
38+
class Lut {
39+
public:
40+
static constexpr size_t kLength = kRows * kCols;
41+
42+
/**
43+
* @brief Construct a lookup table from row arrays
44+
*
45+
* @tparam ColSizes Size of each row array (deduced)
46+
* @param rows Variable number of arrays, each representing a row
47+
*
48+
* @note All rows must have exactly kCols elements
49+
* @note The constructor is constexpr for compile-time initialization
50+
*/
51+
template <size_t... ColSizes>
52+
constexpr Lut(const T (&...rows)[ColSizes]) : row_{} {
53+
// Check that we have the right number of rows
54+
static_assert(sizeof...(rows) == kRows,
55+
"Number of rows doesn't match template parameter");
56+
// Check that all rows have the same number of columns
57+
static_assert(((ColSizes == kCols) && ...),
58+
"All rows must have the same number of columns");
59+
60+
// Copy data using recursive template approach
61+
ToRowMajor_<0>(rows...);
62+
}
63+
64+
/**
65+
* @brief Load values from the LUT using SIMD indices
66+
*
67+
* This method performs efficient SIMD lookups by selecting the optimal
68+
* implementation based on the vector size and LUT dimensions.
69+
*
70+
* @tparam VU SIMD vector type for indices
71+
* @tparam OutV Output SIMD vector types (must match number of rows)
72+
* @param idx SIMD vector of column indices
73+
* @param out Output vectors (one per row)
74+
*
75+
* @note The number of output vectors must exactly match kRows
76+
* @note Index values must be in range [0, kCols)
77+
*/
78+
template <typename VU, typename... OutV>
79+
HWY_INLINE void Load(VU idx, OutV &...out) const {
80+
static_assert(sizeof...(OutV) == kRows,
81+
"Number of output vectors must match number of rows in LUT");
82+
using namespace hn;
83+
using TU = TFromV<VU>;
84+
static_assert(sizeof(TU) == sizeof(T),
85+
"Index type must match LUT element type");
86+
// Row-major based optimization
87+
LoadRow_(idx, out...);
88+
}
89+
90+
private:
91+
/// Convert input rows to row-major storage format
92+
template <size_t RowIDX, size_t... ColSizes>
93+
constexpr void ToRowMajor_(const T (&...rows)[ColSizes]) {
94+
if constexpr (RowIDX < kRows) {
95+
auto row_array = std::get<RowIDX>(std::make_tuple(rows...));
96+
for (size_t col = 0; col < kCols; ++col) {
97+
row_[RowIDX * kCols + col] = row_array[col];
98+
}
99+
ToRowMajor_<RowIDX + 1>(rows...);
100+
}
101+
}
102+
103+
/// Dispatch to optimal row-load implementation based on vector/LUT size
104+
template <size_t Off = 0, typename VU, typename... OutV>
105+
HWY_INLINE void LoadRow_(VU idx, OutV &...out) const {
106+
using namespace hn;
107+
using DU = DFromV<VU>;
108+
const DU du;
109+
using D = Rebind<T, DU>;
110+
const D d;
111+
112+
HWY_LANES_CONSTEXPR size_t kLanes = Lanes(du);
113+
if HWY_LANES_CONSTEXPR (kLanes == kCols) {
114+
// Vector size matches table width - use single table lookup
115+
const auto ind = IndicesFromVec(d, idx);
116+
LoadX1_<Off>(ind, out...);
117+
} else if HWY_LANES_CONSTEXPR (kLanes * 2 == kCols) {
118+
// Vector size is half table width - use two table lookup
119+
const auto ind = IndicesFromVec(d, idx);
120+
LoadX2_<Off>(ind, out...);
121+
} else {
122+
// Fallback to gather for other configurations
123+
LoadGather_<Off>(idx, out...);
124+
}
125+
}
126+
127+
// Load using single table lookup (vector size == table width)
128+
template <size_t Off = 0, typename VInd, typename OutV0, typename... OutV>
129+
HWY_INLINE void LoadX1_(const VInd &ind, OutV0 &out0, OutV &...out) const {
130+
using namespace hn;
131+
using D = DFromV<OutV0>;
132+
const D d;
133+
134+
const OutV0 lut0 = LoadU(d, row_ + Off);
135+
out0 = TableLookupLanes(lut0, ind);
136+
137+
if constexpr (sizeof...(OutV) > 0) {
138+
LoadX1_<Off + kCols>(ind, out...);
139+
}
140+
}
141+
142+
// Load using two table lookups (vector size == table width / 2)
143+
template <size_t Off = 0, typename VInd, typename OutV0, typename... OutV>
144+
HWY_INLINE void LoadX2_(const VInd &ind, OutV0 &out0, OutV &...out) const {
145+
using namespace hn;
146+
using D = DFromV<OutV0>;
147+
const D d;
148+
149+
constexpr size_t kLanes = kCols / 2;
150+
const OutV0 lut0 = LoadU(d, row_ + Off);
151+
const OutV0 lut1 = LoadU(d, row_ + Off + kLanes);
152+
out0 = TwoTablesLookupLanes(d, lut0, lut1, ind);
153+
154+
if constexpr (sizeof...(OutV) > 0) {
155+
LoadX2_<Off + kCols>(ind, out...);
156+
}
157+
}
158+
159+
// General fallback using gather instructions
160+
template <size_t Off = 0, typename VU, typename OutV0, typename... OutV>
161+
HWY_INLINE void LoadGather_(const VU &idx, OutV0 &out0, OutV &...out) const {
162+
using namespace hn;
163+
using D = DFromV<OutV0>;
164+
const D d;
165+
out0 = GatherIndex(d, row_ + Off, BitCast(RebindToSigned<D>(), idx));
166+
if constexpr (sizeof...(OutV) > 0) {
167+
LoadGather_<Off + kCols>(idx, out...);
168+
}
169+
}
170+
171+
// Row-major
172+
HWY_ALIGN T row_[kLength];
173+
};
174+
175+
/**
176+
* @brief Deduction guide for automatic dimension detection
177+
*
178+
* Allows constructing a Lut without explicitly specifying dimensions:
179+
* @code
180+
* Lut lut{row0, row1, row2}; // Dimensions deduced from arrays
181+
* @endcode
182+
*/
183+
template <typename T, size_t First, size_t... Rest>
184+
Lut(const T (&first)[First], const T (&...rest)[Rest])
185+
-> Lut<T, 1 + sizeof...(Rest), First>;
186+
187+
/**
188+
* @brief Factory function that requires explicit type specification
189+
*
190+
* This approach forces users to specify the type T explicitly while
191+
* automatically deducing the dimensions from the array arguments.
192+
*
193+
* Note: We use MakeLut since partial deduction guides (e.g., Lut<float>{...})
194+
* require C++20, but this codebase targets C++17.
195+
*
196+
* @tparam T Element type (must be explicitly specified)
197+
* @param first First row array
198+
* @param rest Additional row arrays
199+
* @return Lut with deduced dimensions
200+
*
201+
* Usage:
202+
* @code
203+
* auto lut = MakeLut<float>(row0, row1, row2); // T explicit, dimensions
204+
* deduced
205+
* @endcode
206+
*/
207+
template <typename T, size_t First, size_t... Rest>
208+
constexpr auto MakeLut(const T (&first)[First], const T (&...rest)[Rest]) {
209+
return Lut<T, 1 + sizeof...(Rest), First>{first, rest...};
210+
}
211+
212+
} // namespace npsr::HWY_NAMESPACE
213+
214+
HWY_AFTER_NAMESPACE();
215+
216+
#endif // NPSR_LUT_INL_H_

npsr/npsr.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// To include them once per target, which is ensured by the toggle check.
2+
#if defined(NPSR_NPSR_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
3+
#ifdef NPSR_NPSR_H_
4+
#undef NPSR_NPSR_H_
5+
#else
6+
#define NPSR_NPSR_H_
7+
#endif
8+
9+
#include "npsr/trig/inl.h"
10+
11+
#endif // NPSR_NPSR_H_

0 commit comments

Comments
 (0)