diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index d37abf0a..f8fd3be0 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -28,5 +28,20 @@ jobs: - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --features "nightly_avx512" --target x86_64-unknown-linux-gnu - - name: Test release pipeline - run: cargo publish --dry-run \ No newline at end of file + + fuzz_decoding: + name: Fuzzing decoders + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo install cargo-fuzz + - run: cargo fuzz run yuv_to_rgb -- -max_total_time=15 + - run: cargo fuzz run yuv_nv_to_rgb -- -max_total_time=15 + - run: cargo fuzz run y_to_rgb -- -max_total_time=15 + - run: cargo fuzz run yuv16_to_rgb16 -- -max_total_time=15 + - run: cargo fuzz run y16_to_rgb16 -- -max_total_time=15 + - run: cargo fuzz run yuv_to_yuyu2 -- -max_total_time=15 diff --git a/Cargo.lock b/Cargo.lock index ff84c627..04233239 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,12 +111,6 @@ dependencies = [ "arrayvec", ] -[[package]] -name = "az" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" - [[package]] name = "bindgen" version = "0.69.5" @@ -293,13 +287,6 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" -[[package]] -name = "coeffs" -version = "0.1.0" -dependencies = [ - "rug", -] - [[package]] name = "color_quant" version = "1.1.0" @@ -475,16 +462,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" -[[package]] -name = "gmp-mpfr-sys" -version = "1.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0205cd82059bc63b63cf516d714352a30c44f2c74da9961dfda2617ae6b5918" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "half" version = "2.4.1" @@ -686,12 +663,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "libm" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" - [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -1095,18 +1066,6 @@ version = "0.8.50" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" -[[package]] -name = "rug" -version = "1.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ae2c1089ec0575193eb9222881310cc1ed8bce3646ef8b81b44b518595b79d" -dependencies = [ - "az", - "gmp-mpfr-sys", - "libc", - "libm", -] - [[package]] name = "rustc-hash" version = "1.1.0" @@ -1564,7 +1523,7 @@ dependencies = [ [[package]] name = "yuvutils-rs" -version = "0.5.10" +version = "0.5.11" dependencies = [ "num-traits", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 573c07b2..772abc5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ -workspace = { members = ["app", "coeffs", "fuzz"] } +workspace = { members = ["app", "fuzz"] } [package] name = "yuvutils-rs" -version = "0.5.10" +version = "0.5.11" edition = "2021" description = "High performance utilities for YUV format handling and conversion." readme = "README.md" diff --git a/app/src/main.rs b/app/src/main.rs index 98fdde01..eb89741d 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -30,12 +30,12 @@ use image::{ColorType, DynamicImage, EncodableLayout, GenericImageView, ImageRea use std::fs::File; use std::io::Read; use std::time::Instant; -use yuv_sys::{rs_I420ToRGB24, rs_NV12ToRGB24, rs_NV21ToABGR, rs_NV21ToRGB24}; +use yuv_sys::{rs_I420ToRGB24, rs_NV12ToRGB24, rs_NV21ToABGR, rs_NV21ToRGB24, rs_RGB24ToI420}; use yuvutils_rs::{ gbr_to_rgb, rgb_to_gbr, rgb_to_sharp_yuv420, rgb_to_yuv420, rgb_to_yuv420_p16, rgb_to_yuv422, rgb_to_yuv422_p16, rgb_to_yuv444, rgb_to_yuv_nv12, yuv420_p16_to_rgb16, yuv420_to_rgb, yuv420_to_yuyv422, yuv422_p16_to_rgb16, yuv422_to_rgb, yuv444_to_rgb, yuv_nv12_to_rgb, - yuv_nv12_to_rgba, yuyv422_to_yuv420, BufferStoreMut, SharpYuvGammaTransfer, + yuv_nv12_to_rgba, yuyv422_to_rgb, yuyv422_to_yuv420, BufferStoreMut, SharpYuvGammaTransfer, YuvBiPlanarImageMut, YuvBytesPacking, YuvChromaSubsampling, YuvEndianness, YuvPackedImage, YuvPackedImageMut, YuvPlanarImageMut, YuvRange, YuvStandardMatrix, }; @@ -113,7 +113,7 @@ fn main() { // println!("rgb_to_yuv_nv12 time: {:?}", start_time.elapsed()); // println!("Forward time: {:?}", start_time.elapsed()); - // // // + // // // // let full_size = if width % 2 == 0 { 2 * width as usize * height as usize } else { @@ -128,34 +128,33 @@ fn main() { 2 * (width as usize + 1) }; - // let mut yuy2_plane = vec![0u8; full_size]; + let mut yuy2_plane = vec![0u8; full_size]; // // // // // let start_time = Instant::now(); // // // // - // let plane = planar_image.to_fixed(); - // // - // let mut packed_image_mut = YuvPackedImageMut { - // yuy: BufferStoreMut::Owned(yuy2_plane), - // yuy_stride: yuy2_stride as u32, - // width, - // height, - // }; + let plane = planar_image.to_fixed(); // // - // yuv420_to_yuyv422(&mut packed_image_mut, &plane).unwrap(); + let mut packed_image_mut = YuvPackedImageMut { + yuy: BufferStoreMut::Owned(yuy2_plane), + yuy_stride: yuy2_stride as u32, + width, + height, + }; + // + yuv420_to_yuyv422(&mut packed_image_mut, &plane).unwrap(); // let end_time = Instant::now().sub(start_time); // println!("yuv420_to_yuyv422 time: {:?}", end_time); // // rgba.fill(0); // // let start_time = Instant::now(); - // yuyv422_to_rgb( - // &yuy2_plane, - // yuy2_stride as u32, - // &mut rgba, - // rgba_stride as u32, - // width, - // height, - // YuvRange::Limited, - // YuvStandardMatrix::Bt709, - // ); + let yuy2_img = packed_image_mut.to_fixed(); + yuyv422_to_rgb( + &yuy2_img, + &mut rgba, + rgba_stride as u32, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + ) + .unwrap(); // // let end_time = Instant::now().sub(start_time); // println!("yuyv422_to_rgb time: {:?}", end_time); @@ -163,18 +162,13 @@ fn main() { // let start_time = Instant::now(); // // // - // let packed_image = YuvPackedImage { - // yuy: packed_image_mut.yuy.borrow(), - // yuy_stride: yuy2_stride as u32, - // width, - // height, - // }; - // // - // yuyv422_to_yuv420(&mut planar_image, &packed_image).unwrap(); + let packed_image = packed_image_mut.to_fixed(); + // + yuyv422_to_yuv420(&mut planar_image, &packed_image).unwrap(); // // // // let end_time = Instant::now().sub(start_time); // println!("yuyv422_to_yuv444 time: {:?}", end_time); - rgba.fill(0); + // rgba.fill(0); // let mut bgra = vec![0u8; width as usize * height as usize * 4]; // let start_time = Instant::now(); // yuv420_to_rgb( @@ -241,7 +235,7 @@ fn main() { // YuvBytesPacking::LeastSignificantBytes, // ) // .unwrap(); - rgba.fill(0); + // rgba.fill(0); // ra30_to_rgb8( // &ar30, // width, @@ -258,52 +252,77 @@ fn main() { // let rgba_stride = width as usize * 4; // let mut rgba = vec![0u8; height as usize * rgba_stride]; - yuv420_to_rgb( - &fixed_planar, - &mut rgba, - rgba_stride as u32, - YuvRange::Limited, - YuvStandardMatrix::Bt601, - ) - .unwrap(); + // yuv420_to_rgb( + // &fixed_planar, + // &mut rgba, + // rgba_stride as u32, + // YuvRange::Limited, + // YuvStandardMatrix::Bt601, + // ) + // .unwrap(); println!("Backward time: {:?}", start_time.elapsed()); let start_time = Instant::now(); // unsafe { - // // rs_I420ToRGB24( - // // fixed_planar.y_plane.as_ptr(), - // // fixed_planar.y_stride as i32, - // // fixed_planar.u_plane.as_ptr(), - // // fixed_planar.u_stride as i32, - // // fixed_planar.v_plane.as_ptr(), - // // fixed_planar.v_stride as i32, - // // rgba.as_mut_ptr(), - // // rgba_stride as i32, - // // fixed_planar.width as i32, - // // fixed_planar.height as i32, - // // ); - // rs_NV12ToRGB24( - // fixed_biplanar.y_plane.as_ptr(), - // fixed_biplanar.y_stride as i32, - // fixed_biplanar.uv_plane.as_ptr(), - // fixed_biplanar.uv_stride as i32, + // let mut planar_image = + // YuvPlanarImageMut::::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420); + // + // let mut source_bgr = vec![0u8; src_bytes.len()]; + // + // rgba.chunks_exact(3).zip(source_bgr.chunks_exact_mut(3)).for_each(|(src, dst)| { + // let b = src[0]; + // dst[0] = src[2]; + // dst[1] = src[1]; + // dst[2] = b; + // }); + // + // rs_RGB24ToI420( + // src_bytes.as_ptr(), + // rgba_stride as i32, + // planar_image.y_plane.borrow_mut().as_mut_ptr(), + // planar_image.y_stride as i32, + // planar_image.u_plane.borrow_mut().as_mut_ptr(), + // planar_image.u_stride as i32, + // planar_image.v_plane.borrow_mut().as_mut_ptr(), + // planar_image.v_stride as i32, + // dimensions.0 as i32, + // dimensions.1 as i32, + // ); + // let fixed_planar = planar_image.to_fixed(); + // rs_I420ToRGB24( + // fixed_planar.y_plane.as_ptr(), + // fixed_planar.y_stride as i32, + // fixed_planar.u_plane.as_ptr(), + // fixed_planar.u_stride as i32, + // fixed_planar.v_plane.as_ptr(), + // fixed_planar.v_stride as i32, // rgba.as_mut_ptr(), // rgba_stride as i32, // fixed_planar.width as i32, // fixed_planar.height as i32, // ); + // + // // rgba.chunks_exact_mut(3).for_each(|chunk| { + // // let b = chunk[0]; + // // chunk[0] = chunk[2]; + // // chunk[2] = b; + // // }); + // // rs_NV12ToRGB24( + // // fixed_biplanar.y_plane.as_ptr(), + // // fixed_biplanar.y_stride as i32, + // // fixed_biplanar.uv_plane.as_ptr(), + // // fixed_biplanar.uv_stride as i32, + // // rgba.as_mut_ptr(), + // // rgba_stride as i32, + // // fixed_planar.width as i32, + // // fixed_planar.height as i32, + // // ); // } // / println!("Backward LIBYUV time: {:?}", start_time.elapsed()); - // rgba.chunks_exact_mut(3).for_each(|chunk| { - // let b = chunk[0]; - // chunk[0] = chunk[2]; - // chunk[2] = b; - // }); - // rgba = bytes_16.iter().map(|&x| (x >> 4) as u8).collect(); image::save_buffer( diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 0e0bf46c..13077b62 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -31,3 +31,24 @@ path = "y_to_rgb/y_to_rgb.rs" test = false doc = false bench = false + +[[bin]] +name = "yuv16_to_rgb16" +path = "yuv16_to_rgb16/yuv16_to_rgb16.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "y16_to_rgb16" +path = "y16_to_rgb16/y16_to_rgb16.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "yuv_to_yuyu2" +path = "yuv_to_yuyu2/yuv_to_yuyu2.rs" +test = false +doc = false +bench = false \ No newline at end of file diff --git a/fuzz/y16_to_rgb16/y16_to_rgb16.rs b/fuzz/y16_to_rgb16/y16_to_rgb16.rs new file mode 100644 index 00000000..33b9f976 --- /dev/null +++ b/fuzz/y16_to_rgb16/y16_to_rgb16.rs @@ -0,0 +1,82 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] +use libfuzzer_sys::fuzz_target; +use yuvutils_rs::{ + yuv400_p16_to_rgb16, yuv400_p16_to_rgba16, YuvBytesPacking, YuvEndianness, YuvGrayImage, + YuvRange, YuvStandardMatrix, +}; + +fuzz_target!(|data: (u8, u8, u8)| { + fuzz_yuv(data.0, data.1, data.2); +}); + +fn fuzz_yuv(i_width: u8, i_height: u8, y_value: u8) { + if i_height == 0 || i_width == 0 { + return; + } + + let y_plane = vec![y_value as u16; i_height as usize * i_width as usize]; + + let planar_image = YuvGrayImage { + y_plane: &y_plane, + y_stride: i_width as u32, + width: i_width as u32, + height: i_height as u32, + }; + + let mut target_rgb = vec![0u16; i_width as usize * i_height as usize * 3]; + + yuv400_p16_to_rgb16( + &planar_image, + &mut target_rgb, + i_width as u32 * 3, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); + + let mut target_rgba = vec![0u16; i_width as usize * i_height as usize * 4]; + + yuv400_p16_to_rgba16( + &planar_image, + &mut target_rgba, + i_width as u32 * 4, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); +} diff --git a/fuzz/yuv16_to_rgb16/yuv16_to_rgb16.rs b/fuzz/yuv16_to_rgb16/yuv16_to_rgb16.rs new file mode 100644 index 00000000..e5256f8c --- /dev/null +++ b/fuzz/yuv16_to_rgb16/yuv16_to_rgb16.rs @@ -0,0 +1,187 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use yuvutils_rs::{ + yuv420_p16_to_rgb16, yuv420_p16_to_rgba16, yuv422_p16_to_rgb16, yuv422_p16_to_rgba16, + yuv444_p16_to_rgb16, yuv444_p16_to_rgba16, YuvBytesPacking, YuvEndianness, YuvPlanarImage, + YuvRange, YuvStandardMatrix, +}; + +fuzz_target!(|data: (u8, u8, u8, u8, u8, u8)| { + fuzz_yuv_420(data.0, data.1, data.2 as u16, data.3 as u16, data.4 as u16); + fuzz_yuv_422(data.0, data.1, data.2 as u16, data.3 as u16, data.4 as u16); + fuzz_yuv_444(data.0, data.1, data.2 as u16, data.3 as u16, data.4 as u16); +}); + +fn fuzz_yuv_420(i_width: u8, i_height: u8, y_value: u16, u_value: u16, v_value: u16) { + if i_height == 0 || i_width == 0 { + return; + } + let y_plane = vec![y_value; i_height as usize * i_width as usize]; + let u_plane = vec![u_value; (i_width as usize).div_ceil(2) * (i_height as usize).div_ceil(2)]; + let v_plane = vec![v_value; (i_width as usize).div_ceil(2) * (i_height as usize).div_ceil(2)]; + + let planar_image = YuvPlanarImage { + y_plane: &y_plane, + y_stride: i_width as u32, + u_plane: &u_plane, + u_stride: (i_width as u32).div_ceil(2), + v_plane: &v_plane, + v_stride: (i_width as u32).div_ceil(2), + width: i_width as u32, + height: i_height as u32, + }; + + let mut target_rgb = vec![0u16; i_width as usize * i_height as usize * 3]; + + yuv420_p16_to_rgb16( + &planar_image, + &mut target_rgb, + i_width as u32 * 3, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); + + let mut target_rgba = vec![0u16; i_width as usize * i_height as usize * 4]; + + yuv420_p16_to_rgba16( + &planar_image, + &mut target_rgba, + i_width as u32 * 4, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); +} + +fn fuzz_yuv_422(i_width: u8, i_height: u8, y_value: u16, u_value: u16, v_value: u16) { + if i_height == 0 || i_width == 0 { + return; + } + let y_plane = vec![y_value; i_height as usize * i_width as usize]; + let u_plane = vec![u_value; (i_width as usize).div_ceil(2) * i_height as usize]; + let v_plane = vec![v_value; (i_width as usize).div_ceil(2) * i_height as usize]; + + let planar_image = YuvPlanarImage { + y_plane: &y_plane, + y_stride: i_width as u32, + u_plane: &u_plane, + u_stride: (i_width as u32).div_ceil(2), + v_plane: &v_plane, + v_stride: (i_width as u32).div_ceil(2), + width: i_width as u32, + height: i_height as u32, + }; + + let mut target_rgb = vec![0u16; i_width as usize * i_height as usize * 3]; + + yuv422_p16_to_rgb16( + &planar_image, + &mut target_rgb, + i_width as u32 * 3, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); + + let mut target_rgba = vec![0u16; i_width as usize * i_height as usize * 4]; + + yuv422_p16_to_rgba16( + &planar_image, + &mut target_rgba, + i_width as u32 * 4, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); +} + +fn fuzz_yuv_444(i_width: u8, i_height: u8, y_value: u16, u_value: u16, v_value: u16) { + if i_height == 0 || i_width == 0 { + return; + } + let y_plane = vec![y_value; i_height as usize * i_width as usize]; + let u_plane = vec![u_value; i_width as usize * i_height as usize]; + let v_plane = vec![v_value; i_width as usize * i_height as usize]; + + let planar_image = YuvPlanarImage { + y_plane: &y_plane, + y_stride: i_width as u32, + u_plane: &u_plane, + u_stride: i_width as u32, + v_plane: &v_plane, + v_stride: i_width as u32, + width: i_width as u32, + height: i_height as u32, + }; + + let mut target_rgb = vec![0u16; i_width as usize * i_height as usize * 3]; + + yuv444_p16_to_rgb16( + &planar_image, + &mut target_rgb, + i_width as u32 * 3, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); + + let mut target_rgba = vec![0u16; i_width as usize * i_height as usize * 4]; + + yuv444_p16_to_rgba16( + &planar_image, + &mut target_rgba, + i_width as u32 * 4, + 10, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + YuvEndianness::LittleEndian, + YuvBytesPacking::LeastSignificantBytes, + ) + .unwrap(); +} diff --git a/fuzz/yuv_to_yuyu2/yuv_to_yuyu2.rs b/fuzz/yuv_to_yuyu2/yuv_to_yuyu2.rs new file mode 100644 index 00000000..8da435f3 --- /dev/null +++ b/fuzz/yuv_to_yuyu2/yuv_to_yuyu2.rs @@ -0,0 +1,171 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use yuvutils_rs::{ + yuv420_to_yuyv422, yuv422_to_yuyv422, yuv444_to_yuyv422, BufferStoreMut, YuvPackedImageMut, + YuvPlanarImage, +}; + +fuzz_target!(|data: (u8, u8, u8, u8, u8)| { + fuzz_yuv_420(data.0, data.1, data.2, data.3); + fuzz_yuv_422(data.0, data.1, data.2, data.3); + fuzz_yuv_444(data.0, data.1, data.2, data.3); +}); + +fn fuzz_yuv_420(i_width: u8, i_height: u8, y_value: u8, uv_value: u8) { + if i_height == 0 || i_width == 0 { + return; + } + let y_plane = vec![y_value; i_height as usize * i_width as usize]; + let u_plane = vec![uv_value; (i_width as usize).div_ceil(2) * (i_height as usize).div_ceil(2)]; + let v_plane = vec![uv_value; (i_width as usize).div_ceil(2) * (i_height as usize).div_ceil(2)]; + + let planar_image = YuvPlanarImage { + y_plane: &y_plane, + y_stride: i_width as u32, + u_plane: &u_plane, + u_stride: (i_width as u32).div_ceil(2), + v_plane: &v_plane, + v_stride: (i_width as u32).div_ceil(2), + width: i_width as u32, + height: i_height as u32, + }; + + let full_size = if i_width % 2 == 0 { + 2 * i_width as usize * i_height as usize + } else { + 2 * (i_width as usize + 1) * i_height as usize + }; + + let yuy2_stride = if i_width % 2 == 0 { + 2 * i_width as usize + } else { + 2 * (i_width as usize + 1) + }; + + let yuy2_plane = vec![0u8; full_size]; + + let mut packed_image = YuvPackedImageMut { + yuy: BufferStoreMut::Owned(yuy2_plane), + yuy_stride: yuy2_stride as u32, + width: i_width as u32, + height: i_height as u32, + }; + + yuv420_to_yuyv422(&mut packed_image, &planar_image).unwrap(); +} + +fn fuzz_yuv_422(i_width: u8, i_height: u8, y_value: u8, uv_value: u8) { + if i_height == 0 || i_width == 0 { + return; + } + let y_plane = vec![y_value; i_height as usize * i_width as usize]; + let u_plane = vec![uv_value; (i_width as usize).div_ceil(2) * i_height as usize]; + let v_plane = vec![uv_value; (i_width as usize).div_ceil(2) * i_height as usize]; + + let planar_image = YuvPlanarImage { + y_plane: &y_plane, + y_stride: i_width as u32, + u_plane: &u_plane, + u_stride: (i_width as u32).div_ceil(2), + v_plane: &v_plane, + v_stride: (i_width as u32).div_ceil(2), + width: i_width as u32, + height: i_height as u32, + }; + + let full_size = if i_width % 2 == 0 { + 2 * i_width as usize * i_height as usize + } else { + 2 * (i_width as usize + 1) * i_height as usize + }; + + let yuy2_stride = if i_width % 2 == 0 { + 2 * i_width as usize + } else { + 2 * (i_width as usize + 1) + }; + + let yuy2_plane = vec![0u8; full_size]; + + let mut packed_image = YuvPackedImageMut { + yuy: BufferStoreMut::Owned(yuy2_plane), + yuy_stride: yuy2_stride as u32, + width: i_width as u32, + height: i_height as u32, + }; + + yuv422_to_yuyv422(&mut packed_image, &planar_image).unwrap(); +} + +fn fuzz_yuv_444(i_width: u8, i_height: u8, y_value: u8, uv_value: u8) { + if i_height == 0 || i_width == 0 { + return; + } + let y_plane = vec![y_value; i_height as usize * i_width as usize]; + let u_plane = vec![uv_value; i_width as usize * i_height as usize]; + let v_plane = vec![uv_value; i_width as usize * i_height as usize]; + + let planar_image = YuvPlanarImage { + y_plane: &y_plane, + y_stride: i_width as u32, + u_plane: &u_plane, + u_stride: i_width as u32, + v_plane: &v_plane, + v_stride: i_width as u32, + width: i_width as u32, + height: i_height as u32, + }; + + let full_size = if i_width % 2 == 0 { + 2 * i_width as usize * i_height as usize + } else { + 2 * (i_width as usize + 1) * i_height as usize + }; + + let yuy2_stride = if i_width % 2 == 0 { + 2 * i_width as usize + } else { + 2 * (i_width as usize + 1) + }; + + let yuy2_plane = vec![0u8; full_size]; + + let mut packed_image = YuvPackedImageMut { + yuy: BufferStoreMut::Owned(yuy2_plane), + yuy_stride: yuy2_stride as u32, + width: i_width as u32, + height: i_height as u32, + }; + + yuv444_to_yuyv422(&mut packed_image, &planar_image).unwrap(); +} diff --git a/src/avx2/avx2_utils.rs b/src/avx2/avx2_utils.rs index 759f4351..e445ad5c 100644 --- a/src/avx2/avx2_utils.rs +++ b/src/avx2/avx2_utils.rs @@ -293,6 +293,22 @@ pub(crate) unsafe fn avx2_pairwise_widen_avg(v: __m256i) -> __m256i { _mm256_permute4x64_epi64::(packed_lo) } +#[inline(always)] +pub(crate) unsafe fn avx2_pairwise_wide_avg(v: __m256i) -> __m256i { + let ones = _mm256_set1_epi8(1); + let sums = _mm256_maddubs_epi16(v, ones); + let shifted = _mm256_srli_epi16::<1>(_mm256_add_epi16(sums, ones)); + shifted +} + +#[inline(always)] +pub(crate) unsafe fn avx_pairwise_avg_epi16(a: __m256i, b: __m256i) -> __m256i { + let sums = _mm256_hadd_epi16(a, b); + let product = _mm256_srli_epi16::<1>(_mm256_add_epi16(sums, _mm256_set1_epi16(1))); + const MASK: i32 = shuffle(3, 1, 2, 0); + _mm256_permute4x64_epi64::(product) +} + #[inline(always)] pub(crate) unsafe fn avx2_div_by255(v: __m256i) -> __m256i { let addition = _mm256_set1_epi16(127); @@ -391,3 +407,13 @@ pub(crate) unsafe fn _mm256_interleave_rgb_epi16( let bgr2 = _mm256_permute2x128_si256::<49>(p0, p2); (bgr0, p1, bgr2) } + +#[inline(always)] +pub(crate) unsafe fn _mm256_havg_epu8(a: __m256i, b: __m256i) -> __m256i { + let ones = _mm256_set1_epi8(1); + let sums_lo = _mm256_maddubs_epi16(a, ones); + let lo = _mm256_srli_epi16::<1>(_mm256_add_epi16(sums_lo, ones)); + let sums_hi = _mm256_maddubs_epi16(b, ones); + let hi = _mm256_srli_epi16::<1>(_mm256_add_epi16(sums_hi, ones)); + avx_pairwise_avg_epi16(lo, hi) +} diff --git a/src/avx2/rgb_to_nv.rs b/src/avx2/rgb_to_nv.rs index 2fdcd789..11d7d81d 100644 --- a/src/avx2/rgb_to_nv.rs +++ b/src/avx2/rgb_to_nv.rs @@ -29,6 +29,7 @@ use crate::avx2::avx2_utils::{ _mm256_deinterleave_rgba_epi8, _mm256_interleave_x2_epi8, avx2_deinterleave_rgb, avx2_pack_u16, + avx_pairwise_avg_epi16, }; use crate::internals::ProcessedOffset; use crate::yuv_support::{ @@ -290,9 +291,9 @@ unsafe fn avx2_rgba_to_nv_impl< } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row) { - let r1 = _mm256_avg_epu16(r_low, r_high); - let g1 = _mm256_avg_epu16(g_low, g_high); - let b1 = _mm256_avg_epu16(b_low, b_high); + let r1 = avx_pairwise_avg_epi16(r_low, r_high); + let g1 = avx_pairwise_avg_epi16(g_low, g_high); + let b1 = avx_pairwise_avg_epi16(b_low, b_high); let cb = _mm256_max_epi16( _mm256_min_epi16( _mm256_add_epi16( diff --git a/src/avx2/rgba_to_yuv.rs b/src/avx2/rgba_to_yuv.rs index ae031f4e..4dd4f398 100644 --- a/src/avx2/rgba_to_yuv.rs +++ b/src/avx2/rgba_to_yuv.rs @@ -28,7 +28,7 @@ */ use crate::avx2::avx2_utils::{ - _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, + _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, avx_pairwise_avg_epi16, }; use crate::internals::ProcessedOffset; use crate::yuv_support::{ @@ -272,9 +272,9 @@ unsafe fn avx2_rgba_to_yuv_impl( } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 || (chroma_subsampling == YuvChromaSubsampling::Yuv420) { - let r1 = _mm256_avg_epu16(r_low, r_high); - let g1 = _mm256_avg_epu16(g_low, g_high); - let b1 = _mm256_avg_epu16(b_low, b_high); + let r1 = avx_pairwise_avg_epi16(r_low, r_high); + let g1 = avx_pairwise_avg_epi16(g_low, g_high); + let b1 = avx_pairwise_avg_epi16(b_low, b_high); let cb = _mm256_max_epi16( _mm256_min_epi16( _mm256_add_epi16( diff --git a/src/avx2/rgba_to_yuv420.rs b/src/avx2/rgba_to_yuv420.rs index 8480c8c5..fc2ef2ef 100644 --- a/src/avx2/rgba_to_yuv420.rs +++ b/src/avx2/rgba_to_yuv420.rs @@ -28,7 +28,7 @@ */ use crate::avx2::avx2_utils::{ - _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, + _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, avx2_pairwise_wide_avg, }; use crate::internals::ProcessedOffset; use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels}; @@ -125,7 +125,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( b_values0 = it1; } - let source_ptr1 = rgba0.get_unchecked(px..).as_ptr(); + let source_ptr1 = rgba1.get_unchecked(px..).as_ptr(); let row_11 = _mm256_loadu_si256(source_ptr1 as *const __m256i); let row_21 = _mm256_loadu_si256(source_ptr1.add(32) as *const __m256i); let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i); @@ -291,9 +291,19 @@ unsafe fn avx2_rgba_to_yuv_impl420( y1_yuv, ); - let r_uv = _mm256_avg_epu16(r0_low, r0_high); - let g_uv = _mm256_avg_epu16(g0_low, g0_high); - let b_uv = _mm256_avg_epu16(b0_low, b0_high); + let r_uv = _mm256_slli_epi16::(_mm256_avg_epu16( + avx2_pairwise_wide_avg(r_values0), + avx2_pairwise_wide_avg(r_values1), + )); + let g_uv = _mm256_slli_epi16::(_mm256_avg_epu16( + avx2_pairwise_wide_avg(g_values0), + avx2_pairwise_wide_avg(g_values1), + )); + let b_uv = _mm256_slli_epi16::(_mm256_avg_epu16( + avx2_pairwise_wide_avg(b_values0), + avx2_pairwise_wide_avg(b_values1), + )); + let cb = _mm256_max_epi16( _mm256_min_epi16( _mm256_add_epi16( @@ -310,6 +320,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), i_bias_y, ); + let cr = _mm256_max_epi16( _mm256_min_epi16( _mm256_add_epi16( @@ -338,8 +349,8 @@ unsafe fn avx2_rgba_to_yuv_impl420( v_ptr.add(uv_x) as *mut _ as *mut __m128i, _mm256_castsi256_si128(cr), ); - uv_x += 16; + uv_x += 16; cx += 32; } diff --git a/src/avx2/yuv_to_yuv2.rs b/src/avx2/yuv_to_yuv2.rs index 8436ec29..bbb6c48a 100644 --- a/src/avx2/yuv_to_yuv2.rs +++ b/src/avx2/yuv_to_yuv2.rs @@ -26,7 +26,9 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::avx2::avx2_utils::{_mm256_deinterleave_x2_epi8, _mm256_store_interleaved_epi8}; +use crate::avx2::avx2_utils::{ + _mm256_deinterleave_x2_epi8, _mm256_havg_epu8, _mm256_store_interleaved_epi8, +}; use crate::yuv_support::{YuvChromaSubsampling, Yuy2Description}; use crate::yuv_to_yuy2::YuvToYuy2Navigation; #[cfg(target_arch = "x86")] @@ -64,10 +66,12 @@ pub(crate) unsafe fn yuv_to_yuy2_avx2_row_impl 32, + YuvChromaSubsampling::Yuv444 => 64, + }; unsafe { - let max_x_32 = (width as usize / 2).saturating_sub(32); - - for x in (_yuy2_x..max_x_32).step_by(32) { + while _cx + 64 < width as usize { let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; @@ -93,8 +97,8 @@ pub(crate) unsafe fn yuv_to_yuy2_avx2_row_impl (v_pixels, low_y, u_pixels, high_y), }; - let dst_offset = x * 4; + let dst_offset = _cx * 2; _mm256_store_interleaved_epi8( yuy2_store.as_mut_ptr().add(dst_offset), @@ -119,17 +123,12 @@ pub(crate) unsafe fn yuv_to_yuy2_avx2_row_impl 32, - YuvChromaSubsampling::Yuv444 => 64, - }; - _cx += 64; - } + _uv_x += chroma_big_step; + _cx += 64; } + _yuy2_x = _cx; + YuvToYuy2Navigation { cx: _cx, uv_x: _uv_x, diff --git a/src/avx2/yuy2_to_rgb.rs b/src/avx2/yuy2_to_rgb.rs index 862c4c87..654c31de 100644 --- a/src/avx2/yuy2_to_rgb.rs +++ b/src/avx2/yuy2_to_rgb.rs @@ -70,8 +70,6 @@ unsafe fn yuy2_to_rgb_avx_impl let mut _yuy2_x = nav.x; unsafe { - let max_x_32 = (width as usize / 2).saturating_sub(32); - let y_corr = _mm256_set1_epi8(range.bias_y as i8); let uv_corr = _mm256_set1_epi16(range.bias_uv as i16); let v_luma_coeff = _mm256_set1_epi16(transform.y_coef as i16); @@ -83,8 +81,8 @@ unsafe fn yuy2_to_rgb_avx_impl let zeros = _mm256_setzero_si256(); let rounding_const = _mm256_set1_epi16((1 << 5) - 1); - for x in (_yuy2_x..max_x_32).step_by(32) { - let yuy2_offset = x * 4; + while _cx + 64 < width as usize { + let yuy2_offset = _cx * 2; let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); @@ -347,11 +345,10 @@ unsafe fn yuy2_to_rgb_avx_impl } } - _yuy2_x = x; - if x + 32 < max_x_32 { - _cx += 64; - } + _cx += 64; } + + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/avx2/yuy2_to_yuv.rs b/src/avx2/yuy2_to_yuv.rs index eacb6449..f53c0029 100644 --- a/src/avx2/yuy2_to_yuv.rs +++ b/src/avx2/yuy2_to_yuv.rs @@ -67,10 +67,8 @@ unsafe fn yuy2_to_yuv_avx_impl( let mut _uv_x = nav.uv_x; let mut _yuy2_x = nav.x; - let max_x_32 = (width as usize / 2).saturating_sub(32); - - for x in (_yuy2_x..max_x_32).step_by(32) { - let dst_offset = x * 4; + while _cx + 64 < width as usize { + let dst_offset = _cx * 2; let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; @@ -128,16 +126,15 @@ unsafe fn yuy2_to_yuv_avx_impl( _mm256_storeu_si256(y_plane_ptr as *mut __m256i, y_first); _mm256_storeu_si256(y_plane_ptr.add(32) as *mut __m256i, y_second); - _yuy2_x = x; - if x + 32 < max_x_32 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 32, - YuvChromaSubsampling::Yuv444 => 64, - }; - _cx += 64; - } + _uv_x += match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 32, + YuvChromaSubsampling::Yuv444 => 64, + }; + _cx += 64; } + _yuy2_x = _cx; + YuvToYuy2Navigation { cx: _cx, uv_x: _uv_x, diff --git a/src/images.rs b/src/images.rs index 0ac7811d..e0066ec4 100644 --- a/src/images.rs +++ b/src/images.rs @@ -513,4 +513,13 @@ where check_yuv_packed(self.yuy.borrow(), self.yuy_stride, self.width, self.height)?; Ok(()) } + + pub fn to_fixed(&self) -> YuvPackedImage { + YuvPackedImage { + yuy: self.yuy.borrow(), + yuy_stride: self.yuy_stride, + width: self.width, + height: self.height, + } + } } diff --git a/src/neon/neon_simd_support.rs b/src/neon/neon_simd_support.rs index e07a8a74..4ffae877 100644 --- a/src/neon/neon_simd_support.rs +++ b/src/neon/neon_simd_support.rs @@ -149,3 +149,14 @@ pub(crate) unsafe fn vldq_s16_endian uint8x16x2_t { + uint8x16x2_t(vld1q_u8(src), vld1q_u8(src.add(16))) +} + +#[inline(always)] +pub(crate) unsafe fn xvst1q_u8_x2(ptr: *mut u8, b: uint8x16x2_t) { + vst1q_u8(ptr, b.0); + vst1q_u8(ptr.add(16), b.1); +} diff --git a/src/neon/rgba_to_yuv420.rs b/src/neon/rgba_to_yuv420.rs index 78b00323..79bf90ae 100644 --- a/src/neon/rgba_to_yuv420.rs +++ b/src/neon/rgba_to_yuv420.rs @@ -181,11 +181,11 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420(vrshrq_n_u16::<1>(box_r_values))); - let box_g_values = vpaddlq_u8(g_values0); + let box_g_values = vhaddq_u16(vpaddlq_u8(g_values0), vpaddlq_u8(g_values1)); let g1 = vreinterpretq_s16_u16(vshlq_n_u16::(vrshrq_n_u16::<1>(box_g_values))); - let box_b_values = vpaddlq_u8(b_values0); + let box_b_values = vhaddq_u16(vpaddlq_u8(b_values0), vpaddlq_u8(b_values1)); let b1 = vreinterpretq_s16_u16(vshlq_n_u16::(vrshrq_n_u16::<1>(box_b_values))); let mut cbl = vqrdmlahq_laneq_s16::<3>(uv_bias, r1, v_weights); @@ -210,7 +210,6 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420(vpaddlq_u8(r_values0))); - let g1 = vreinterpretq_s16_u16(vrshrq_n_u16::<1>(vpaddlq_u8(g_values0))); - let b1 = vreinterpretq_s16_u16(vrshrq_n_u16::<1>(vpaddlq_u8(b_values0))); + let r1 = vreinterpretq_s16_u16(vrshrq_n_u16::<1>(vhaddq_u16( + vpaddlq_u8(r_values0), + vpaddlq_u8(r_values1), + ))); + let g1 = vreinterpretq_s16_u16(vrshrq_n_u16::<1>(vhaddq_u16( + vpaddlq_u8(g_values0), + vpaddlq_u8(g_values1), + ))); + let b1 = vreinterpretq_s16_u16(vrshrq_n_u16::<1>(vhaddq_u16( + vpaddlq_u8(b_values0), + vpaddlq_u8(b_values1), + ))); let mut cb_h = vmlal_high_laneq_s16::<3>(uv_bias, r1, v_weights); cb_h = vmlal_high_laneq_s16::<4>(cb_h, g1, v_weights); diff --git a/src/neon/yuv_to_yuy2.rs b/src/neon/yuv_to_yuy2.rs index 581d2103..d48d408b 100644 --- a/src/neon/yuv_to_yuy2.rs +++ b/src/neon/yuv_to_yuy2.rs @@ -26,19 +26,16 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::neon::neon_simd_support::xvld1q_u8_x2; use crate::yuv_support::{YuvChromaSubsampling, Yuy2Description}; use crate::yuv_to_yuy2::YuvToYuy2Navigation; use std::arch::aarch64::*; pub(crate) fn yuv_to_yuy2_neon_impl( y_plane: &[u8], - y_offset: usize, u_plane: &[u8], - u_offset: usize, v_plane: &[u8], - v_offset: usize, yuy2_store: &mut [u8], - yuy2_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -47,27 +44,34 @@ pub(crate) fn yuv_to_yuy2_neon_impl 16, + YuvChromaSubsampling::Yuv444 => 32, + }; + + let chroma_small_step_size = match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, + YuvChromaSubsampling::Yuv444 => 16, + }; + let mut _cx = nav.cx; let mut _uv_x = nav.uv_x; let mut _yuy2_x = nav.x; unsafe { let v_shuffle = vld1q_u8(shuffle_table.as_ptr()); - let max_x_16 = (width as usize / 2).saturating_sub(16); - let max_x_8 = (width as usize / 2).saturating_sub(8); - - for x in (_yuy2_x..max_x_16).step_by(16) { - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + while _cx + 32 < width as usize { + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let u_pixels; let v_pixels; - let y_pixels = vld1q_u8_x2(y_plane.as_ptr().add(y_pos)); + let y_pixels = xvld1q_u8_x2(y_plane.as_ptr().add(y_pos)); if chroma_subsampling == YuvChromaSubsampling::Yuv444 { - let full_u = vld1q_u8_x2(u_plane.as_ptr().add(u_pos)); - let full_v = vld1q_u8_x2(v_plane.as_ptr().add(v_pos)); + let full_u = xvld1q_u8_x2(u_plane.as_ptr().add(u_pos)); + let full_v = xvld1q_u8_x2(v_plane.as_ptr().add(v_pos)); u_pixels = vhaddq_u8(full_u.0, full_u.1); v_pixels = vhaddq_u8(full_v.0, full_v.1); @@ -89,25 +93,17 @@ pub(crate) fn yuv_to_yuy2_neon_impl uint8x16x4_t(v_pixels, low_y, u_pixels, high_y), }; - let dst_offset = yuy2_offset + x * 4; + let dst_offset = _cx * 2; vst4q_u8(yuy2_store.as_mut_ptr().add(dst_offset), storage); - - _yuy2_x = x; - - if x + 16 < max_x_16 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 16, - YuvChromaSubsampling::Yuv444 => 32, - }; - _cx += 32; - } + _cx += 32; + _uv_x += chroma_big_step_size; } - for x in (_yuy2_x..max_x_8).step_by(8) { - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + while _cx + 16 < width as usize { + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let u_pixels; let v_pixels; @@ -144,20 +140,15 @@ pub(crate) fn yuv_to_yuy2_neon_impl uint8x8x4_t(v_pixels, low_y, u_pixels, high_y), }; - let dst_offset = yuy2_offset + x * 4; + let dst_offset = _cx * 2; vst4_u8(yuy2_store.as_mut_ptr().add(dst_offset), storage); - _yuy2_x = x; - - if x + 8 < max_x_8 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, - YuvChromaSubsampling::Yuv444 => 16, - }; - _cx += 16; - } + _cx += 16; + _uv_x += chroma_small_step_size; } + + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/neon/yuy2_to_rgb.rs b/src/neon/yuy2_to_rgb.rs index 038a8415..442857fd 100644 --- a/src/neon/yuy2_to_rgb.rs +++ b/src/neon/yuy2_to_rgb.rs @@ -51,9 +51,6 @@ pub(crate) fn yuy2_to_rgb_neon< let mut _yuy2_x = nav.x; unsafe { - let max_x_16 = (width as usize / 2).saturating_sub(16); - let max_x_8 = (width as usize / 2).saturating_sub(8); - let y_corr = vdupq_n_u8(range.bias_y as u8); let uv_corr = vdupq_n_s16(range.bias_uv as i16); let v_luma_coeff = vdupq_n_u8(transform.y_coef as u8); @@ -64,12 +61,12 @@ pub(crate) fn yuy2_to_rgb_neon< let v_g_coeff_2 = vdupq_n_s16(-(transform.g_coeff_2 as i16)); let v_alpha = vdupq_n_u8(255u8); - for x in (_yuy2_x..max_x_16).step_by(16) { - let dst_offset = x * 4; + while _cx + 32 < width as usize { + let yuy2_offset = _cx * 2; let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); - let pixel_set = vld4q_u8(yuy2_store.as_ptr().add(dst_offset)); + let pixel_set = vld4q_u8(yuy2_store.as_ptr().add(yuy2_offset)); let mut y_first = match yuy2_source { Yuy2Description::YUYV | Yuy2Description::YVYU => pixel_set.0, Yuy2Description::UYVY | Yuy2Description::VYUY => pixel_set.1, @@ -245,18 +242,15 @@ pub(crate) fn yuy2_to_rgb_neon< } } - _yuy2_x = x; - if x + 16 < max_x_16 { - _cx += 32; - } + _cx += 32; } - for x in (_yuy2_x..max_x_8).step_by(8) { - let dst_offset = x * 4; + while _cx + 16 < width as usize { + let yuy2_offset = _cx * 2; let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); - let pixel_set = vld4_u8(yuy2_store.as_ptr().add(dst_offset)); + let pixel_set = vld4_u8(yuy2_store.as_ptr().add(yuy2_offset)); let mut y_first = match yuy2_source { Yuy2Description::YUYV | Yuy2Description::YVYU => pixel_set.0, Yuy2Description::UYVY | Yuy2Description::VYUY => pixel_set.1, @@ -355,11 +349,9 @@ pub(crate) fn yuy2_to_rgb_neon< } } - _yuy2_x = x; - if x + 8 < max_x_8 { - _cx += 16; - } + _cx += 16; } + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/neon/yuy2_to_yuv.rs b/src/neon/yuy2_to_yuv.rs index 30198aee..f2bace0b 100644 --- a/src/neon/yuy2_to_yuv.rs +++ b/src/neon/yuy2_to_yuv.rs @@ -26,6 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::neon::neon_simd_support::xvst1q_u8_x2; use crate::yuv_support::{YuvChromaSubsampling, Yuy2Description}; use crate::yuv_to_yuy2::YuvToYuy2Navigation; use std::arch::aarch64::*; @@ -46,11 +47,8 @@ pub(crate) fn yuy2_to_yuv_neon_impl pixel_set.0, }; - vst1q_u8_x2( + xvst1q_u8_x2( y_plane.as_mut_ptr().add(y_pos), uint8x16x2_t(y_first, y_second), ); @@ -93,11 +91,11 @@ pub(crate) fn yuy2_to_yuv_neon_impl 16, - YuvChromaSubsampling::Yuv444 => 32, - }; - _cx += 32; - } + _uv_x += match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 16, + YuvChromaSubsampling::Yuv444 => 32, + }; + _cx += 32; } - for x in (_yuy2_x..max_x_8).step_by(8) { - let dst_offset = x * 4; + while _cx + 16 < width as usize { + let dst_offset = _cx * 2; let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; @@ -173,15 +168,14 @@ pub(crate) fn yuy2_to_yuv_neon_impl 8, - YuvChromaSubsampling::Yuv444 => 16, - }; - _cx += 16; - } + _uv_x += match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, + YuvChromaSubsampling::Yuv444 => 16, + }; + _cx += 16; } + + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/rgba_to_yuv.rs b/src/rgba_to_yuv.rs index 13befbc1..13e835f0 100644 --- a/src/rgba_to_yuv.rs +++ b/src/rgba_to_yuv.rs @@ -380,9 +380,9 @@ fn rgbx_to_yuv8( >> PRECISION; y_dst1[1] = y_11.max(i_bias_y).min(i_cap_y) as u8; - let ruv = (r00 + r01 + 1) >> 1; - let guv = (g00 + g01 + 1) >> 1; - let buv = (b00 + b01 + 1) >> 1; + let ruv = (r00 + r01 + r10 + r11 + 2) >> 2; + let guv = (g00 + g01 + g10 + g11 + 2) >> 2; + let buv = (b00 + b01 + b10 + b11 + 2) >> 2; let cb = (ruv * transform.cb_r + guv * transform.cb_g + buv * transform.cb_b + bias_uv) >> PRECISION; @@ -416,6 +416,10 @@ fn rgbx_to_yuv8( (r1 * transform.yr + g1 * transform.yg + b1 * transform.yb + bias_y) >> PRECISION; *y1_last = y_1.max(i_bias_y).min(i_cap_y) as u8; + let r0 = (r0 + r1) >> 1; + let g0 = (g0 + g1) >> 1; + let b0 = (b0 + b1) >> 1; + let cb = (r0 * transform.cb_r + g0 * transform.cb_g + b0 * transform.cb_b + bias_uv) >> PRECISION; let cr = (r0 * transform.cr_r + g0 * transform.cr_g + b0 * transform.cr_b + bias_uv) diff --git a/src/sse/rgb_to_nv.rs b/src/sse/rgb_to_nv.rs index 94b6d928..85cfff55 100644 --- a/src/sse/rgb_to_nv.rs +++ b/src/sse/rgb_to_nv.rs @@ -28,6 +28,7 @@ */ use crate::internals::ProcessedOffset; +use crate::sse::sse_pairwise_avg_epi16; use crate::sse::sse_support::{sse_deinterleave_rgb, sse_deinterleave_rgba}; use crate::yuv_support::{ CbCrForwardTransform, YuvChromaRange, YuvChromaSubsampling, YuvNVOrder, YuvSourceChannels, @@ -286,9 +287,9 @@ unsafe fn sse_rgba_to_nv_row_impl< } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row) { - let r1 = _mm_avg_epu16(r_low, r_high); - let g1 = _mm_avg_epu16(g_low, g_high); - let b1 = _mm_avg_epu16(b_low, b_high); + let r1 = sse_pairwise_avg_epi16(r_low, r_high); + let g1 = sse_pairwise_avg_epi16(g_low, g_high); + let b1 = sse_pairwise_avg_epi16(b_low, b_high); let cbk = _mm_max_epi16( _mm_min_epi16( diff --git a/src/sse/rgba_to_yuv.rs b/src/sse/rgba_to_yuv.rs index 9fa834cc..cbf600d1 100644 --- a/src/sse/rgba_to_yuv.rs +++ b/src/sse/rgba_to_yuv.rs @@ -28,6 +28,7 @@ */ use crate::internals::ProcessedOffset; +use crate::sse::sse_pairwise_avg_epi16; use crate::sse::sse_support::{sse_deinterleave_rgb, sse_deinterleave_rgba}; use crate::yuv_support::{ CbCrForwardTransform, YuvChromaRange, YuvChromaSubsampling, YuvSourceChannels, @@ -261,9 +262,9 @@ unsafe fn sse_rgba_to_yuv_row_impl( y1_yuv, ); - let r1 = _mm_avg_epu16(r0_low, r0_high); - let g1 = _mm_avg_epu16(g0_low, g0_high); - let b1 = _mm_avg_epu16(b0_low, b0_high); + let r1 = _mm_slli_epi16::(_mm_avg_epu16( + sse_pairwise_wide_avg(r_values0), + sse_pairwise_wide_avg(r_values1), + )); + let g1 = _mm_slli_epi16::(_mm_avg_epu16( + sse_pairwise_wide_avg(g_values0), + sse_pairwise_wide_avg(g_values1), + )); + let b1 = _mm_slli_epi16::(_mm_avg_epu16( + sse_pairwise_wide_avg(b_values0), + sse_pairwise_wide_avg(b_values1), + )); let cbk = _mm_max_epi16( _mm_min_epi16( diff --git a/src/sse/sse_support.rs b/src/sse/sse_support.rs index 0ab2e04e..61364ad0 100644 --- a/src/sse/sse_support.rs +++ b/src/sse/sse_support.rs @@ -237,6 +237,23 @@ pub(crate) unsafe fn sse_pairwise_widen_avg(v: __m128i) -> __m128i { _mm_packus_epi16(shifted, shifted) } +#[inline(always)] +pub(crate) unsafe fn sse_pairwise_wide_avg(v: __m128i) -> __m128i { + let ones = _mm_set1_epi8(1); + let sums = _mm_maddubs_epi16(v, ones); + _mm_srli_epi16::<1>(_mm_add_epi16(sums, ones)) +} + +#[inline(always)] +pub(crate) unsafe fn _mm_havg_epu8(a: __m128i, b: __m128i) -> __m128i { + let ones = _mm_set1_epi8(1); + let sums_lo = _mm_maddubs_epi16(a, ones); + let lo = _mm_srli_epi16::<1>(_mm_add_epi16(sums_lo, ones)); + let sums_hi = _mm_maddubs_epi16(b, ones); + let hi = _mm_srli_epi16::<1>(_mm_add_epi16(sums_hi, ones)); + _mm_packus_epi16(lo, hi) +} + #[inline(always)] pub(crate) unsafe fn sse_div_by255(v: __m128i) -> __m128i { let addition = _mm_set1_epi16(127); diff --git a/src/sse/yuv_to_yuy2.rs b/src/sse/yuv_to_yuy2.rs index 1aa4f089..9ee6ef4d 100644 --- a/src/sse/yuv_to_yuy2.rs +++ b/src/sse/yuv_to_yuy2.rs @@ -26,6 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::sse::_mm_havg_epu8; use crate::sse::sse_support::{ __mm128x4, _mm_combineh_epi8, _mm_combinel_epi8, _mm_gethigh_epi8, _mm_getlow_epi8, _mm_loadu_si128_x2, _mm_storeu_si128_x4, sse_interleave_rgba, @@ -67,15 +68,23 @@ unsafe fn yuv_to_yuy2_sse_impl( let mut _cx = nav.cx; let mut _uv_x = nav.uv_x; let mut _yuy2_x = nav.x; + + let chroma_big_step_size = match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 16, + YuvChromaSubsampling::Yuv444 => 32, + }; + + let chroma_small_step_size = match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, + YuvChromaSubsampling::Yuv444 => 16, + }; + unsafe { #[rustfmt::skip] let v_shuffle = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let max_x_16 = (width as usize / 2).saturating_sub(16); - let max_x_8 = (width as usize / 2).saturating_sub(8); - - for x in (_yuy2_x..max_x_16).step_by(16) { + while _cx + 32 < width as usize { let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; @@ -88,8 +97,8 @@ unsafe fn yuv_to_yuy2_sse_impl( let full_u = _mm_loadu_si128_x2(u_plane.as_ptr().add(u_pos)); let full_v = _mm_loadu_si128_x2(v_plane.as_ptr().add(v_pos)); - u_pixels = _mm_avg_epu8(full_u.0, full_u.1); - v_pixels = _mm_avg_epu8(full_v.0, full_v.1); + u_pixels = _mm_havg_epu8(full_u.0, full_u.1); + v_pixels = _mm_havg_epu8(full_v.0, full_v.1); } else { u_pixels = _mm_loadu_si128(u_plane.as_ptr().add(u_pos) as *const __m128i); v_pixels = _mm_loadu_si128(v_plane.as_ptr().add(v_pos) as *const __m128i); @@ -108,25 +117,17 @@ unsafe fn yuv_to_yuy2_sse_impl( Yuy2Description::VYUY => __mm128x4(v_pixels, low_y, u_pixels, high_y), }; - let dst_offset = x * 4; + let dst_offset = _cx * 2; let inverleaved = sse_interleave_rgba(storage.0, storage.1, storage.2, storage.3); let converted = __mm128x4(inverleaved.0, inverleaved.1, inverleaved.2, inverleaved.3); _mm_storeu_si128_x4(yuy2_store.as_mut_ptr().add(dst_offset), converted); - - _yuy2_x = x; - - if x + 16 < max_x_16 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 16, - YuvChromaSubsampling::Yuv444 => 32, - }; - _cx += 32; - } + _cx += 32; + _uv_x += chroma_big_step_size; } - for x in (_yuy2_x..max_x_8).step_by(8) { + while _cx + 16 < width as usize { let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; @@ -143,12 +144,12 @@ unsafe fn yuv_to_yuy2_sse_impl( let low_u = _mm_getlow_epi8(full_u); let high_u = _mm_gethigh_epi8(full_u); - u_pixels = _mm_avg_epu8(low_u, high_u); + u_pixels = _mm_havg_epu8(low_u, high_u); let low_v = _mm_getlow_epi8(full_v); let high_v = _mm_gethigh_epi8(full_v); - v_pixels = _mm_avg_epu8(low_v, high_v); + v_pixels = _mm_havg_epu8(low_v, high_v); } else { u_pixels = _mm_loadu_si64(u_plane.as_ptr().add(u_pos)); v_pixels = _mm_loadu_si64(v_plane.as_ptr().add(v_pos)); @@ -169,23 +170,18 @@ unsafe fn yuv_to_yuy2_sse_impl( let inverleaved = sse_interleave_rgba(storage.0, storage.1, storage.2, storage.3); let converted = __mm128x4(inverleaved.0, inverleaved.1, inverleaved.2, inverleaved.3); - let dst_offset = x * 4; + let dst_offset = _cx * 2; let ptr = yuy2_store.as_mut_ptr().add(dst_offset); _mm_storeu_si128(ptr as *mut __m128i, converted.0); _mm_storeu_si128(ptr.add(16) as *mut __m128i, converted.1); - _yuy2_x = x; - - if x + 8 < max_x_8 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, - YuvChromaSubsampling::Yuv444 => 16, - }; - _cx += 16; - } + _cx += 16; + _uv_x += chroma_small_step_size; } + + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/sse/yuy2_to_rgb.rs b/src/sse/yuy2_to_rgb.rs index 9eac582b..e33d16dc 100644 --- a/src/sse/yuy2_to_rgb.rs +++ b/src/sse/yuy2_to_rgb.rs @@ -67,9 +67,6 @@ unsafe fn yuy2_to_rgb_sse_impl let mut _yuy2_x = nav.x; unsafe { - let max_x_16 = (width as usize / 2).saturating_sub(16); - let max_x_8 = (width as usize / 2).saturating_sub(8); - let y_corr = _mm_set1_epi8(range.bias_y as i8); let uv_corr = _mm_set1_epi16(range.bias_uv as i16); let v_luma_coeff = _mm_set1_epi16(transform.y_coef as i16); @@ -82,8 +79,8 @@ unsafe fn yuy2_to_rgb_sse_impl let zeros = _mm_setzero_si128(); - for x in (_yuy2_x..max_x_16).step_by(16) { - let yuy2_offset = x * 4; + while _cx + 32 < width as usize { + let yuy2_offset = _cx * 2; let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); @@ -328,14 +325,11 @@ unsafe fn yuy2_to_rgb_sse_impl } } - _yuy2_x = x; - if x + 16 < max_x_16 { - _cx += 32; - } + _cx += 32; } - for x in (_yuy2_x..max_x_8).step_by(8) { - let yuy2_offset = x * 4; + while _cx + 16 < width as usize { + let yuy2_offset = _cx * 2; let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); @@ -472,11 +466,10 @@ unsafe fn yuy2_to_rgb_sse_impl } } - _yuy2_x = x; - if x + 8 < max_x_8 { - _cx += 16; - } + _cx += 16; } + + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/sse/yuy2_to_yuv.rs b/src/sse/yuy2_to_yuv.rs index bb49217c..32dfa66e 100644 --- a/src/sse/yuy2_to_yuv.rs +++ b/src/sse/yuy2_to_yuv.rs @@ -66,16 +66,13 @@ unsafe fn yuy2_to_yuv_sse_impl( let mut _yuy2_x = nav.x; unsafe { - let max_x_16 = (width as usize / 2).saturating_sub(16); - let max_x_8 = (width as usize / 2).saturating_sub(8); - - for x in (_yuy2_x..max_x_16).step_by(16) { - let yuy2_offset = x * 4; + while _cx + 32 < width as usize { + let dst_offset = _cx * 2; let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; - let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); + let yuy2_ptr = yuy2_store.as_ptr().add(dst_offset); let j0 = _mm_loadu_si128(yuy2_ptr as *const __m128i); let j1 = _mm_loadu_si128(yuy2_ptr.add(16) as *const __m128i); @@ -133,23 +130,20 @@ unsafe fn yuy2_to_yuv_sse_impl( _mm_storeu_si128(y_plane_ptr as *mut __m128i, y_first); _mm_storeu_si128(y_plane_ptr.add(16) as *mut __m128i, y_second); - _yuy2_x = x; - if x + 16 < max_x_16 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 16, - YuvChromaSubsampling::Yuv444 => 32, - }; - _cx += 32; - } + _uv_x += match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 16, + YuvChromaSubsampling::Yuv444 => 32, + }; + _cx += 32; } - for x in (_yuy2_x..max_x_8).step_by(8) { - let yuy2_offset = x * 4; + while _cx + 16 < width as usize { + let dst_offset = _cx * 2; let u_pos = _uv_x; let v_pos = _uv_x; let y_pos = _cx; - let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); + let yuy2_ptr = yuy2_store.as_ptr().add(dst_offset); let j0 = _mm_loadu_si128(yuy2_ptr as *const __m128i); let j1 = _mm_loadu_si128(yuy2_ptr.add(16) as *const __m128i); @@ -203,15 +197,14 @@ unsafe fn yuy2_to_yuv_sse_impl( ); } - _yuy2_x = x; - if x + 8 < max_x_8 { - _uv_x += match chroma_subsampling { - YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, - YuvChromaSubsampling::Yuv444 => 16, - }; - _cx += 16; - } + _uv_x += match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => 8, + YuvChromaSubsampling::Yuv444 => 16, + }; + _cx += 16; } + + _yuy2_x = _cx; } YuvToYuy2Navigation { diff --git a/src/yuv_to_yuy2.rs b/src/yuv_to_yuy2.rs index 7e3249e4..51c3d818 100644 --- a/src/yuv_to_yuy2.rs +++ b/src/yuv_to_yuy2.rs @@ -94,9 +94,9 @@ impl ProcessWideRow for u8 { let mut _processed = 0usize; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut _use_sse = is_x86_feature_detected!("sse4.1"); + let _use_sse = is_x86_feature_detected!("sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut _use_avx2 = is_x86_feature_detected!("avx2"); + let _use_avx2 = is_x86_feature_detected!("avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { @@ -128,13 +128,9 @@ impl ProcessWideRow for u8 { { let processed = yuv_to_yuy2_neon_impl::( _y_src, - 0, _u_src, - 0, _v_src, - 0, _yuy2, - 0, _width as u32, YuvToYuy2Navigation::new(0, 0, 0), ); @@ -332,7 +328,7 @@ pub(crate) fn yuv_to_yuy2_impl< .zip(y_src.chunks_exact(2)) .zip(u_src.iter()) .zip(v_src.iter()) - .skip(processed) + .skip(processed / 2) { yuy2[yuy2_target.get_first_y_position()] = y_src[0]; yuy2[yuy2_target.get_second_y_position()] = y_src[1]; @@ -422,7 +418,7 @@ pub(crate) fn yuv_to_yuy2_impl< .zip(y_src.chunks_exact(2)) .zip(u_src.iter()) .zip(v_src.iter()) - .skip(processed) + .skip(processed / 2) { yuy2[yuy2_target.get_first_y_position()] = y_src[0]; yuy2[yuy2_target.get_second_y_position()] = y_src[1]; @@ -483,7 +479,7 @@ pub(crate) fn yuv_to_yuy2_impl< .zip(rem_y.chunks_exact(2)) .zip(last_u.iter()) .zip(last_v.iter()) - .skip(processed) + .skip(processed / 2) { yuy2[yuy2_target.get_first_y_position()] = y_src[0]; yuy2[yuy2_target.get_second_y_position()] = y_src[1]; diff --git a/src/yuy2_to_rgb.rs b/src/yuy2_to_rgb.rs index 471312b8..dbbe58fb 100644 --- a/src/yuy2_to_rgb.rs +++ b/src/yuy2_to_rgb.rs @@ -80,9 +80,9 @@ fn yuy2_to_rgb_impl( let bias_uv = range.bias_uv as i32; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1"); + let _use_sse = std::arch::is_x86_feature_detected!("sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut _use_avx = std::arch::is_x86_feature_detected!("avx2"); + let _use_avx = std::arch::is_x86_feature_detected!("avx2"); let rgb_iter; let yuy2_iter; @@ -159,7 +159,7 @@ fn yuy2_to_rgb_impl( for (rgb, yuy2) in rgb_store .chunks_exact_mut(2 * channels) .zip(yuy2_store.chunks_exact(4)) - .skip(_cx) + .skip(_cx / 2) { let first_y = yuy2[yuy2_source.get_first_y_position()]; let second_y = yuy2[yuy2_source.get_second_y_position()];