Skip to content

Commit

Permalink
fn filter_plane_{cols,rows}_{y,uv}: Refactor, cleanup, and optimize (
Browse files Browse the repository at this point in the history
…#1193)

This also helps prepare for making `fn loopfilter_sb`'s `lvl` slice
safe.
  • Loading branch information
kkysen authored Jun 12, 2024
2 parents 64ed8a2 + 97b49fd commit c3a6f07
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 171 deletions.
232 changes: 99 additions & 133 deletions src/lf_apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ use crate::src::lr_apply::LR_RESTORE_U;
use crate::src::lr_apply::LR_RESTORE_V;
use crate::src::lr_apply::LR_RESTORE_Y;
use crate::src::relaxed_atomic::RelaxedAtomic;
use crate::src::unstable_extensions::as_chunks;
use crate::src::unstable_extensions::flatten;
use libc::ptrdiff_t;
use std::array;
use std::cmp;
Expand Down Expand Up @@ -342,55 +340,39 @@ pub(crate) unsafe fn rav1d_copy_lpf<BD: BitDepth>(
}
}

/// Slice `[u8; 4]`s from `lvl`, but "unaligned",
/// meaning the `[u8; 4]`s can straddle
/// adjacent `[u8; 4]`s in the `lvl` slice.
///
/// Note that this does not result in actual unaligned reads,
/// since `[u8; 4]` has an alignment of 1.
/// This optimizes to a single slice with a bounds check.
#[inline(always)]
fn unaligned_lvl_slice(lvl: &[[u8; 4]], y: usize) -> &[[u8; 4]] {
as_chunks(&flatten(lvl)[y..]).0
}

#[inline]
unsafe fn filter_plane_cols_y<BD: BitDepth>(
f: &Rav1dFrameData,
have_left: bool,
lvl: &[[u8; 4]],
mask: &[[[RelaxedAtomic<u16>; 2]; 3]; 32],
y_dst: Rav1dPictureDataComponentOffset,
w: c_int,
starty4: c_int,
endy4: c_int,
w: usize,
starty4: usize,
endy4: usize,
) {
// filter edges between columns (e.g. block1 | block2)
for x in 0..w as usize {
if !(!have_left && x == 0) {
let mut hmask: [u32; 3] = [0; 3];
if starty4 == 0 {
hmask[0] = mask[x][0][0].get() as u32;
hmask[1] = mask[x][1][0].get() as u32;
hmask[2] = mask[x][2][0].get() as u32;
if endy4 > 16 {
hmask[0] |= (mask[x][0][1].get() as u32) << 16;
hmask[1] |= (mask[x][1][1].get() as u32) << 16;
hmask[2] |= (mask[x][2][1].get() as u32) << 16;
}
let lf_sb = &f.dsp.lf.loop_filter_sb;
let len = endy4 - starty4;
let mask = &mask[..w];
for x in 0..w {
if !have_left && x == 0 {
continue;
}
let mask = &mask[x];
let hmask = if starty4 == 0 {
if endy4 > 16 {
mask.each_ref()
.map(|[a, b]| a.get() as u32 | ((b.get() as u32) << 16))
} else {
hmask[0] = mask[x][0][1].get() as u32;
hmask[1] = mask[x][1][1].get() as u32;
hmask[2] = mask[x][2][1].get() as u32;
mask.each_ref().map(|[a, _]| a.get() as u32)
}
f.dsp.lf.loop_filter_sb[0][0].call::<BD>(
f,
y_dst + x * 4,
&hmask,
&lvl[x..],
endy4 - starty4,
);
}
} else {
mask.each_ref().map(|[_, b]| b.get() as u32)
};
let y_dst = y_dst + x * 4;
let lvl = &lvl[x..];
lf_sb.y.h.call::<BD>(f, y_dst, &hmask, lvl, 0, len);
}
}

Expand All @@ -401,30 +383,29 @@ unsafe fn filter_plane_rows_y<BD: BitDepth>(
lvl: &[[u8; 4]],
b4_stride: usize,
mask: &[[[RelaxedAtomic<u16>; 2]; 3]; 32],
mut y_dst: Rav1dPictureDataComponentOffset,
w: c_int,
starty4: c_int,
endy4: c_int,
y_dst: Rav1dPictureDataComponentOffset,
w: usize,
starty4: usize,
endy4: usize,
) {
// block1
// filter edges between rows (e.g. ------)
// block2
for (y, lvl) in (starty4..endy4).zip(lvl.chunks(b4_stride as usize)) {
if !(!have_top && y == 0) {
let vmask = [
mask[y as usize][0][0].get() as u32 | (mask[y as usize][0][1].get() as u32) << 16,
mask[y as usize][1][0].get() as u32 | (mask[y as usize][1][1].get() as u32) << 16,
mask[y as usize][2][0].get() as u32 | (mask[y as usize][2][1].get() as u32) << 16,
];
f.dsp.lf.loop_filter_sb[0][1].call::<BD>(
f,
y_dst,
&vmask,
unaligned_lvl_slice(&lvl[0..], 1),
w,
);
let lf_sb = &f.dsp.lf.loop_filter_sb;
let len = endy4 - starty4;
let lvl = &lvl[..1 + (len - 1) * b4_stride];
for i in 0..len {
let y = i + starty4;
let y_dst = y_dst + (i as isize * 4 * y_dst.data.pixel_stride::<BD>());
if !have_top && y == 0 {
continue;
}
y_dst += 4 * y_dst.data.pixel_stride::<BD>();
let mask = &mask[y];
let vmask = mask
.each_ref()
.map(|[a, b]| a.get() as u32 | ((b.get() as u32) << 16));
let lvl = &lvl[i * b4_stride..];
lf_sb.y.v.call::<BD>(f, y_dst, &vmask, lvl, 1, w);
}
}

Expand All @@ -436,42 +417,37 @@ unsafe fn filter_plane_cols_uv<BD: BitDepth>(
mask: &[[[RelaxedAtomic<u16>; 2]; 2]; 32],
u_dst: Rav1dPictureDataComponentOffset,
v_dst: Rav1dPictureDataComponentOffset,
w: c_int,
starty4: c_int,
endy4: c_int,
w: usize,
starty4: usize,
endy4: usize,
ss_ver: c_int,
) {
// filter edges between columns (e.g. block1 | block2)
for x in 0..w as usize {
if !(!have_left && x == 0) {
let mut hmask: [u32; 3] = [0; 3];
if starty4 == 0 {
hmask[0] = mask[x as usize][0][0].get() as u32;
hmask[1] = mask[x as usize][1][0].get() as u32;
if endy4 > 16 >> ss_ver {
hmask[0] |= (mask[x as usize][0][1].get() as u32) << (16 >> ss_ver);
hmask[1] |= (mask[x as usize][1][1].get() as u32) << (16 >> ss_ver);
}
let lf_sb = &f.dsp.lf.loop_filter_sb;
let len = endy4 - starty4;
let mask = &mask[..w];
let lvl = &lvl[..w];
for x in 0..w {
if !have_left && x == 0 {
continue;
}
let u_dst = u_dst + x * 4;
let v_dst = v_dst + x * 4;
let mask = &mask[x];
let hmask = if starty4 == 0 {
if endy4 > 16 >> ss_ver {
mask.each_ref()
.map(|[a, b]| a.get() as u32 | ((b.get() as u32) << (16 >> ss_ver)))
} else {
hmask[0] = mask[x as usize][0][1].get() as u32;
hmask[1] = mask[x as usize][1][1].get() as u32;
mask.each_ref().map(|[a, _]| a.get() as u32)
}
// hmask[2] = 0; Already initialized to 0 above
f.dsp.lf.loop_filter_sb[1][0].call::<BD>(
f,
u_dst + x * 4,
&hmask,
unaligned_lvl_slice(&lvl[x as usize..], 2),
endy4 - starty4,
);
f.dsp.lf.loop_filter_sb[1][0].call::<BD>(
f,
v_dst + x * 4,
&hmask,
unaligned_lvl_slice(&lvl[x as usize..], 3),
endy4 - starty4,
);
}
} else {
mask.each_ref().map(|[_, b]| b.get() as u32)
};
let hmask = [hmask[0], hmask[1], 0];
let lvl = &lvl[x..];
lf_sb.uv.h.call::<BD>(f, u_dst, &hmask, lvl, 2, len);
lf_sb.uv.h.call::<BD>(f, v_dst, &hmask, lvl, 3, len);
}
}

Expand All @@ -482,43 +458,33 @@ unsafe fn filter_plane_rows_uv<BD: BitDepth>(
lvl: &[[u8; 4]],
b4_stride: usize,
mask: &[[[RelaxedAtomic<u16>; 2]; 2]; 32],
mut u_dst: Rav1dPictureDataComponentOffset,
mut v_dst: Rav1dPictureDataComponentOffset,
w: c_int,
starty4: c_int,
endy4: c_int,
u_dst: Rav1dPictureDataComponentOffset,
v_dst: Rav1dPictureDataComponentOffset,
w: usize,
starty4: usize,
endy4: usize,
ss_hor: c_int,
) {
// block1
// filter edges between rows (e.g. ------)
// block2
for (y, lvl) in (starty4..endy4).zip(lvl.chunks(b4_stride as usize)) {
if !(!have_top && y == 0) {
let vmask: [u32; 3] = [
mask[y as usize][0][0].get() as u32
| (mask[y as usize][0][1].get() as u32) << (16 >> ss_hor),
mask[y as usize][1][0].get() as u32
| (mask[y as usize][1][1].get() as u32) << (16 >> ss_hor),
0,
];
f.dsp.lf.loop_filter_sb[1][1].call::<BD>(
f,
u_dst,
&vmask,
unaligned_lvl_slice(&lvl[0..], 2),
w,
);
f.dsp.lf.loop_filter_sb[1][1].call::<BD>(
f,
v_dst,
&vmask,
unaligned_lvl_slice(&lvl[0..], 3),
w,
);
let lf_sb = &f.dsp.lf.loop_filter_sb;
let len = endy4 - starty4;
let lvl = &lvl[..1 + (len - 1) * b4_stride];
for i in 0..len {
let y = i + starty4;
let u_dst = u_dst + (i as isize * 4 * u_dst.data.pixel_stride::<BD>());
let v_dst = v_dst + (i as isize * 4 * v_dst.data.pixel_stride::<BD>());
if !have_top && y == 0 {
continue;
}

u_dst += 4 * u_dst.data.pixel_stride::<BD>();
v_dst += 4 * v_dst.data.pixel_stride::<BD>();
let vmask = mask[y]
.each_ref()
.map(|[a, b]| a.get() as u32 | ((b.get() as u32) << (16 >> ss_hor)));
let vmask = [vmask[0], vmask[1], 0];
let lvl = &lvl[i * b4_stride..];
lf_sb.uv.v.call::<BD>(f, u_dst, &vmask, lvl, 2, w);
lf_sb.uv.v.call::<BD>(f, v_dst, &vmask, lvl, 3, w);
}
}

Expand Down Expand Up @@ -642,9 +608,9 @@ pub(crate) unsafe fn rav1d_loopfilter_sbrow_cols<BD: BitDepth>(
level_ptr,
&lflvl[x].filter_y[0],
py + x * 128,
cmp::min(32, f.w4 - x as c_int * 32),
starty4 as c_int,
endy4 as c_int,
cmp::min(32, f.w4 - x as c_int * 32) as usize,
starty4 as usize,
endy4 as usize,
);
have_left = true;
level_ptr = &level_ptr[32..];
Expand All @@ -665,9 +631,9 @@ pub(crate) unsafe fn rav1d_loopfilter_sbrow_cols<BD: BitDepth>(
&lflvl[x].filter_uv[0],
pu + x * (128 >> ss_hor),
pv + x * (128 >> ss_hor),
cmp::min(32, f.w4 - x as c_int * 32) + ss_hor >> ss_hor,
starty4 as c_int >> ss_ver,
uv_endy4 as c_int,
(cmp::min(32, f.w4 - x as c_int * 32) + ss_hor >> ss_hor) as usize,
starty4 as usize >> ss_ver,
uv_endy4 as usize,
ss_ver,
);
have_left = true;
Expand Down Expand Up @@ -706,9 +672,9 @@ pub(crate) unsafe fn rav1d_loopfilter_sbrow_rows<BD: BitDepth>(
f.b4_stride as usize,
&lflvl[x].filter_y[1],
p[0] + 128 * x,
cmp::min(32, f.w4 - x as c_int * 32),
starty4,
endy4 as c_int,
cmp::min(32, f.w4 - x as c_int * 32) as usize,
starty4 as usize,
endy4 as usize,
);
level_ptr = &level_ptr[32..];
}
Expand All @@ -732,9 +698,9 @@ pub(crate) unsafe fn rav1d_loopfilter_sbrow_rows<BD: BitDepth>(
&lflvl[x].filter_uv[1],
pu + (x * 128 >> ss_hor),
pv + (x * 128 >> ss_hor),
cmp::min(32 as c_int, f.w4 - x as c_int * 32) + ss_hor >> ss_hor,
starty4 >> ss_ver,
uv_endy4 as c_int,
(cmp::min(32 as c_int, f.w4 - x as c_int * 32) + ss_hor >> ss_hor) as usize,
starty4 as usize >> ss_ver,
uv_endy4 as usize,
ss_hor,
);
level_ptr = &level_ptr[32 >> ss_hor..];
Expand Down
Loading

0 comments on commit c3a6f07

Please sign in to comment.