Skip to content

Commit

Permalink
Reduce code duplication
Browse files Browse the repository at this point in the history
  • Loading branch information
HolyWu committed Jul 5, 2020
1 parent fff2c99 commit d0df8c2
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 377 deletions.
4 changes: 2 additions & 2 deletions CAS/CAS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ static void VS_CC casCreate(const VSMap * in, VSMap * out, void * userData, VSCo

for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->vi->width >> (plane ? d->vi->format->subSamplingW : 0) < 3)
throw "every plane's width must be greater than or equal to 3";
throw "plane's width must be greater than or equal to 3";

if (d->vi->height >> (plane ? d->vi->format->subSamplingH : 0) < 3)
throw "every plane's height must be greater than or equal to 3";
throw "plane's height must be greater than or equal to 3";
}

d->sharpness = static_cast<float>(vsapi->propGetFloat(in, "sharpness", 0, &err));
Expand Down
193 changes: 68 additions & 125 deletions CAS/CAS_AVX2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,45 @@

template<typename pixel_t>
void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
auto load_8u = [](const void * srcp) noexcept {
using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec8i, Vec8f>;

const vec_t limit = std::any_cast<var_t>(data->limit);

auto load = [](const pixel_t * srcp) noexcept {
if constexpr (std::is_same_v<pixel_t, uint8_t>)
return Vec8i().load_8uc(srcp);
return vec_t().load_8uc(srcp);
else if constexpr (std::is_same_v<pixel_t, uint16_t>)
return vec_t().load_8us(srcp);
else
return Vec8i().load_8us(srcp);
return vec_t().load(srcp);
};

auto store_8u = [&](const Vec8f __result, void * dstp) noexcept {
const Vec8i _result = truncatei(__result + 0.5f);

auto store = [&](const Vec8f srcp, pixel_t * dstp) noexcept {
if constexpr (std::is_same_v<pixel_t, uint8_t>) {
const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si256()), zero_si256()).get_low();
const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si256()), zero_si256()).get_low();
result.storel(dstp);
} else {
const auto result = compress_saturated_s2u(_result, zero_si256()).get_low();
} else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si256()).get_low();
min(result, data->peak).store_nt(dstp);
} else {
srcp.store_nt(dstp);
}
};

using var_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec8i, Vec8f>;

const var_t limit = std::any_cast<std::conditional_t<std::is_integral_v<pixel_t>, int, float>>(data->limit);

auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i,
auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
const Vec8f chromaOffset) noexcept {
// Soft min and max.
//  a b c             b
//  d e f * 0.5  +  d e f * 0.5
//  g h i             h
// These are 2.0x bigger (factored out the extra multiply).
var_t mn = min(min(min(d, e), min(f, b)), h);
const var_t mn2 = min(min(min(mn, a), min(c, g)), i);
vec_t mn = min(min(min(d, e), min(f, b)), h);
const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
mn += mn2;

var_t mx = max(max(max(d, e), max(f, b)), h);
const var_t mx2 = max(max(max(mx, a), max(c, g)), i);
vec_t mx = max(max(max(d, e), max(f, b)), h);
const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
mx += mx2;

if constexpr (std::is_floating_point_v<pixel_t>) {
Expand Down Expand Up @@ -77,128 +80,68 @@ void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const

const Vec8f chromaOffset = plane ? 1.0f : 0.0f;

const int regularPart = (width - 1) & ~(Vec8i().size() - 1);
const int regularPart = (width - 1) & ~(vec_t().size() - 1);

for (int y = 0; y < height; y++) {
const pixel_t * above = srcp + (y == 0 ? stride : -stride);
const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);

if constexpr (std::is_integral_v<pixel_t>) {
{
const Vec8i b = load_8u(above + 0);
const Vec8i e = load_8u(srcp + 0);
const Vec8i h = load_8u(below + 0);

const Vec8i a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
const Vec8i d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
const Vec8i g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);

Vec8i c, f, i;
if (width > Vec8i().size()) {
c = load_8u(above + 1);
f = load_8u(srcp + 1);
i = load_8u(below + 1);
} else {
c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
}

const Vec8f result = filtering(a, b, c,
d, e, f,
g, h, i,
chromaOffset);

store_8u(result, dstp + 0);
}

for (int x = Vec8i().size(); x < regularPart; x += Vec8i().size()) {
const Vec8f result = filtering(load_8u(above + x - 1), load_8u(above + x), load_8u(above + x + 1),
load_8u(srcp + x - 1), load_8u(srcp + x), load_8u(srcp + x + 1),
load_8u(below + x - 1), load_8u(below + x), load_8u(below + x + 1),
chromaOffset);

store_8u(result, dstp + x);
{
const vec_t b = load(above + 0);
const vec_t e = load(srcp + 0);
const vec_t h = load(below + 0);

const vec_t a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
const vec_t d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
const vec_t g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);

vec_t c, f, i;
if (width > vec_t().size()) {
c = load(above + 1);
f = load(srcp + 1);
i = load(below + 1);
} else {
c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
}

if (regularPart >= Vec8i().size()) {
const Vec8i a = load_8u(above + regularPart - 1);
const Vec8i d = load_8u(srcp + regularPart - 1);
const Vec8i g = load_8u(below + regularPart - 1);
const Vec8f result = filtering(a, b, c,
d, e, f,
g, h, i,
chromaOffset);

const Vec8i b = load_8u(above + regularPart);
const Vec8i e = load_8u(srcp + regularPart);
const Vec8i h = load_8u(below + regularPart);

const Vec8i c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
const Vec8i f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
const Vec8i i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);

const Vec8f result = filtering(a, b, c,
d, e, f,
g, h, i,
chromaOffset);

store_8u(result, dstp + regularPart);
}
} else {
{
const Vec8f b = Vec8f().load_a(above + 0);
const Vec8f e = Vec8f().load_a(srcp + 0);
const Vec8f h = Vec8f().load_a(below + 0);

const Vec8f a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
const Vec8f d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
const Vec8f g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);

Vec8f c, f, i;
if (width > Vec8f().size()) {
c = Vec8f().load(above + 1);
f = Vec8f().load(srcp + 1);
i = Vec8f().load(below + 1);
} else {
c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
}

const Vec8f result = filtering(a, b, c,
d, e, f,
g, h, i,
chromaOffset);

result.store_nt(dstp + 0);
}
store(result, dstp + 0);
}

for (int x = Vec8f().size(); x < regularPart; x += Vec8f().size()) {
const Vec8f result = filtering(Vec8f().load(above + x - 1), Vec8f().load_a(above + x), Vec8f().load(above + x + 1),
Vec8f().load(srcp + x - 1), Vec8f().load_a(srcp + x), Vec8f().load(srcp + x + 1),
Vec8f().load(below + x - 1), Vec8f().load_a(below + x), Vec8f().load(below + x + 1),
chromaOffset);
for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
const Vec8f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
load(below + x - 1), load(below + x), load(below + x + 1),
chromaOffset);

result.store_nt(dstp + x);
}
store(result, dstp + x);
}

if (regularPart >= Vec8f().size()) {
const Vec8f a = Vec8f().load(above + regularPart - 1);
const Vec8f d = Vec8f().load(srcp + regularPart - 1);
const Vec8f g = Vec8f().load(below + regularPart - 1);
if (regularPart >= vec_t().size()) {
const vec_t a = load(above + regularPart - 1);
const vec_t d = load(srcp + regularPart - 1);
const vec_t g = load(below + regularPart - 1);

const Vec8f b = Vec8f().load_a(above + regularPart);
const Vec8f e = Vec8f().load_a(srcp + regularPart);
const Vec8f h = Vec8f().load_a(below + regularPart);
const vec_t b = load(above + regularPart);
const vec_t e = load(srcp + regularPart);
const vec_t h = load(below + regularPart);

const Vec8f c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
const Vec8f f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
const Vec8f i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
const vec_t c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
const vec_t f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
const vec_t i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);

const Vec8f result = filtering(a, b, c,
d, e, f,
g, h, i,
chromaOffset);
const Vec8f result = filtering(a, b, c,
d, e, f,
g, h, i,
chromaOffset);

result.store_nt(dstp + regularPart);
}
store(result, dstp + regularPart);
}

srcp += stride;
Expand Down
Loading

0 comments on commit d0df8c2

Please sign in to comment.