From b49ff1e82fbb7f686d1cbcaf331f2fe6219604c5 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Tue, 11 Jun 2024 04:17:04 -0700 Subject: [PATCH 1/4] `fn filter_plane_{cols,rows}_{y,uv}`: Elide `mask` bounds checks by masking the index. --- src/lf_apply.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/lf_apply.rs b/src/lf_apply.rs index 364847f90..ead5d888d 100644 --- a/src/lf_apply.rs +++ b/src/lf_apply.rs @@ -360,7 +360,7 @@ unsafe fn filter_plane_cols_y( if !have_left && x == 0 { continue; } - let mask = &mask[x]; + let mask = &mask[x % mask.len()]; // To elide the bounds check. let hmask = if starty4 == 0 { if endy4 > 16 { mask.each_ref() @@ -401,8 +401,7 @@ unsafe fn filter_plane_rows_y( if !have_top && y == 0 { continue; } - let mask = &mask[y]; - let vmask = mask + let vmask = mask[y % mask.len()] // To elide the bounds check. .each_ref() .map(|[a, b]| a.get() as u32 | ((b.get() as u32) << 16)); let lvl = &lvl[i * b4_stride..]; @@ -436,7 +435,7 @@ unsafe fn filter_plane_cols_uv( if !have_left && x == 0 { continue; } - let mask = &mask[x]; + let mask = &mask[x % mask.len()]; // To elide the bounds check. let hmask = if starty4 == 0 { if endy4 > 16 >> ss_ver { mask.each_ref() @@ -483,7 +482,7 @@ unsafe fn filter_plane_rows_uv( if !have_top && y == 0 { continue; } - let vmask = mask[y] + let vmask = mask[y % mask.len()] // To elide the bounds check. .each_ref() .map(|[a, b]| a.get() as u32 | ((b.get() as u32) << (16 >> ss_hor))); let vmask = [vmask[0], vmask[1], 0]; From 3f980ef5891f92f23d6fbb458884e70bd72c1725 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Wed, 12 Jun 2024 13:16:24 -0700 Subject: [PATCH 2/4] `fn rav1d_cdf_thread_copy`: Elide bounds checks by masking the index. 
--- src/cdf.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cdf.rs b/src/cdf.rs index b2cd45d3c..562f45c51 100644 --- a/src/cdf.rs +++ b/src/cdf.rs @@ -5097,10 +5097,11 @@ pub fn rav1d_cdf_thread_init_static(qidx: u8) -> CdfThreadContext { pub fn rav1d_cdf_thread_copy(src: &CdfThreadContext) -> CdfContext { match src { CdfThreadContext::Cdf(src) => src.cdf.try_read().unwrap().clone(), - CdfThreadContext::QCat(i) => CdfContext { + &CdfThreadContext::QCat(i) => CdfContext { m: Default::default(), kfym: default_kf_y_mode_cdf, - coef: av1_default_coef_cdf[*i as usize].clone(), + // `i` is the sum of 3 `bool`s + coef: av1_default_coef_cdf[i as usize & 3].clone(), mv: Default::default(), }, } From 0c98626ab85ef7eb47fb9eead891569de71c6521 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Thu, 13 Jun 2024 10:30:36 -0700 Subject: [PATCH 3/4] `fn splat_dc`: Use `zerocopy` to make the slice casting safe. --- src/ipred.rs | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/ipred.rs b/src/ipred.rs index ad3970b23..23a4e9fa0 100644 --- a/src/ipred.rs +++ b/src/ipred.rs @@ -36,9 +36,10 @@ use libc::ptrdiff_t; use std::cmp; use std::ffi::c_int; use std::ffi::c_uint; -use std::mem; use std::slice; use strum::FromRepr; +use zerocopy::AsBytes; +use zerocopy::FromBytes; #[cfg(all( feature = "asm", @@ -214,7 +215,7 @@ pub struct Rav1dIntraPredDSPContext { #[inline(never)] unsafe fn splat_dc( - mut dst: *mut BD::Pixel, + dst: *mut BD::Pixel, stride: ptrdiff_t, width: c_int, height: c_int, @@ -222,38 +223,34 @@ unsafe fn splat_dc( bd: BD, ) { let stride = BD::pxstride(stride); + let height = height as isize; let width = width as usize; + assert!(dc <= bd.bitdepth_max().as_::()); + let dc = dc.as_::(); match BD::BPC { BPC::BPC8 => { - assert!(dc <= 0xff); if width > 4 { - let dcN = dc as u64 * 0x101010101010101; - for _ in 0..height { - let slice = - slice::from_raw_parts_mut(dst.cast::(), width 
/ mem::size_of::()); - slice.fill(dcN); - dst = dst.offset(stride); + for y in 0..height { + let dst = dst.offset(y * stride); + let dst = slice::from_raw_parts_mut(dst, width); + let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); + dst.fill([dc; 8]); } } else { - let dcN = dc as u32 * 0x1010101; - for _ in 0..height { - let slice = - slice::from_raw_parts_mut(dst.cast::(), width / mem::size_of::()); - slice.fill(dcN); - dst = dst.offset(stride); + for y in 0..height { + let dst = dst.offset(y * stride); + let dst = slice::from_raw_parts_mut(dst, width); + let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); + dst.fill([dc; 4]); } }; } BPC::BPC16 => { - assert!(dc <= bd.bitdepth_max().as_::()); - let dcN = dc as u64 * 0x1000100010001; - for _ in 0..height { - let slice = slice::from_raw_parts_mut( - dst.cast::(), - width / (mem::size_of::() >> 1), - ); - slice.fill(dcN); - dst = dst.offset(stride); + for y in 0..height { + let dst = dst.offset(y as isize * stride); + let dst = slice::from_raw_parts_mut(dst, width); + let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); + dst.fill([dc; 4]); } } } From ea357e38333737f77b70ff50331a2ee6e1bfd41c Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Thu, 13 Jun 2024 10:31:51 -0700 Subject: [PATCH 4/4] `fn splat_dc`: Combine bitdepth versions, as the code is now written generically. 
--- src/ipred.rs | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/src/ipred.rs b/src/ipred.rs index 23a4e9fa0..688e210fe 100644 --- a/src/ipred.rs +++ b/src/ipred.rs @@ -227,33 +227,21 @@ unsafe fn splat_dc( let width = width as usize; assert!(dc <= bd.bitdepth_max().as_::()); let dc = dc.as_::(); - match BD::BPC { - BPC::BPC8 => { - if width > 4 { - for y in 0..height { - let dst = dst.offset(y * stride); - let dst = slice::from_raw_parts_mut(dst, width); - let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); - dst.fill([dc; 8]); - } - } else { - for y in 0..height { - let dst = dst.offset(y * stride); - let dst = slice::from_raw_parts_mut(dst, width); - let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); - dst.fill([dc; 4]); - } - }; + if BD::BPC == BPC::BPC8 && width > 4 { + for y in 0..height { + let dst = dst.offset(y * stride); + let dst = slice::from_raw_parts_mut(dst, width); + let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); + dst.fill([dc; 8]); } - BPC::BPC16 => { - for y in 0..height { - let dst = dst.offset(y as isize * stride); - let dst = slice::from_raw_parts_mut(dst, width); - let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); - dst.fill([dc; 4]); - } + } else { + for y in 0..height { + let dst = dst.offset(y * stride); + let dst = slice::from_raw_parts_mut(dst, width); + let dst = FromBytes::mut_slice_from(AsBytes::as_bytes_mut(dst)).unwrap(); + dst.fill([dc; 4]); } - } + }; } #[inline(never)]